[speex-dev] [PATCH] Make SSE Run Time option. Add Win32 SSE code

Aron Rosenberg aron at sightspeed.com
Thu Jan 8 21:18:25 PST 2004


         Attached is a patch that does two things. First it makes the use 
of the current SSE code a run time option through the use 
of speex_decoder_ctl() and speex_encoder_ctl().
It does this twofold. First there is a modification to the configure.in 
script which introduces a check based upon platform. It will compile in the 
sse assembly if you are on an i?86 based platform by making a special 
define. Second, it adds a new ctl value called SPEEX_SET_ASM_FLAG  which 
takes in an integer. The values are defined as:

#define SPEEX_SET_ASM_FLAG              200
#define SPEEX_ASM_MMX_NONE              0
#define SPEEX_ASM_MMX_BASIC             1
#define SPEEX_ASM_MMX_SSE               2
#define SPEEX_ASM_MMX_SSE_FP    4

The current Speex SSE code requires full SSE2 support which corresponds to 
SPEEX_ASM_MMX_SSE_FP. None of the other defines are actively used, but they 
are included since they represent different Intel/AMD processors. For 
example, an AMD Duron only supports SPEEX_ASM_MMX_BASIC while Pentium 3's 
and above support full SPEEX_ASM_MMX_SSE_FP.

The second part of the patch adds the equivalent MS Windows assembler for 
the same sections that currently have GCC x86 assembler code.

Notes about implementation: We took the easiest route when hacking in the 
flag support which was to add a global flag for the entire library at 
runtime and extern it in all the various files.
Jean-Marc: We looked at adding the flag into the state structures, however 
they were not passed all the way down into the filters.c files and it would 
have been a massive change to make it pass all the needed data. The 
approach we took should be ok since on a given machine you would have the 
same settings. The decoder_ctl and encoder_ctl set the same global flag.

The way we setup the asm flags var should allow you to add the ARM assembly 
in the exact same manner. You would add a check in the configure.in for the 
platform and define a _USE_ARM and place the code in the same functions as 
we did. You would then add a SPEEX_ASM_ARM 8  or something and let the 
application decide to turn it on.

Other Notes: This patch obsoletes ltp_sse.h and filters_sse.h. However 
the patch does not remove them. This is the updated version of the patch 
we sent in November.

Comments are welcome.  BTW, we have been shipping our Video Conferencing 
product which only uses the Speex codec for 6 months now and have gotten 
rave reviews (PC Magazine Editors' Choice) for the audio and video quality. 
We use Speex in Windows, Mac OS-X, and Linux as we have clients for each 
platform. Keep up the great work! Check us out at 
http://www.sightspeed.com  and please try our beta version (Mac and Windows 
Clients available now) at http://www.sightspeed.com/page.php?page=beta

Patch in .gz format attached, plain text below.


Aron Rosenberg
SightSpeed Inc.


diff -ru speex-1.0.3-orig/configure.in speex-1.0.3/configure.in
--- speex-1.0.3-orig/configure.in       2003-10-30 08:57:25.000000000 -0800
+++ speex-1.0.3/configure.in    2004-01-08 20:44:25.000000000 -0800
@@ -18,11 +18,11 @@
  # For automake.



@@ -52,8 +52,18 @@
  AC_CHECK_LIB(gnugetopt, getopt_long)

+echo -n "Checking for Platform ASM Speedups..."
+case "$target" in
+    i?86*)
+        echo "Found SSE and SSE2"
+        ;;
+    *)
+        echo "None Found"
+        ;;

-AC_ARG_ENABLE(sse, [  --enable-sse             enable SSE support], [if 
test "$enableval" = yes; then AC_DEFINE(_USE_SSE) fi])

  dnl Output the makefiles and version.h.

diff -ru speex-1.0.3-orig/libspeex/filters.c speex-1.0.3/libspeex/filters.c
--- speex-1.0.3-orig/libspeex/filters.c 2003-01-05 21:56:56.000000000 -0800
+++ speex-1.0.3/libspeex/filters.c      2004-01-08 20:44:41.000000000 -0800
@@ -32,8 +32,10 @@

  #include "filters.h"
  #include "stack_alloc.h"
+#include "speex.h"
  #include <math.h>

+extern int global_use_mmx_sse;

  void bw_lpc(float gamma, float *lpc_in, float *lpc_out, int order)
@@ -46,41 +48,548 @@

-#ifdef _USE_SSE
-#include "filters_sse.h"
-void filter_mem2(float *x, float *num, float *den, float *y, int N, int 
ord, float *mem)
+void filter_mem2(float *x, float *_num, float *_den, float *y, int N, int 
ord, float *_mem)
-   int i,j;
-   float xi,yi;
-   for (i=0;i<N;i++)
-   {
-      xi=x[i];
-      y[i] = num[0]*xi + mem[0];
-      yi=y[i];
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = mem[j+1] + num[j+1]*xi - den[j+1]*yi;
-      }
-      mem[ord-1] = num[ord]*xi - den[ord]*yi;
-   }
+       if (!(global_use_mmx_sse & SPEEX_ASM_MMX_SSE_FP))
+       {
+          int i,j;
+          float xi,yi;
+          for (i=0;i<N;i++)
+          {
+                 xi=x[i];
+                 y[i] = _num[0]*xi + _mem[0];
+                 yi=y[i];
+                 for (j=0;j<ord-1;j++)
+                 {
+                        _mem[j] = _mem[j+1] + _num[j+1]*xi - _den[j+1]*yi;
+                 }
+                 _mem[ord-1] = _num[ord]*xi - _den[ord]*yi;
+          }
+       }
+       else
+       {
+          float __num[20], __den[20], __mem[20];
+          float *num, *den, *mem;
+          int i;
+          num = (float*)(((int)(__num+4))&0xfffffff0)-1;
+          den = (float*)(((int)(__den+4))&0xfffffff0)-1;
+          mem = (float*)(((int)(__mem+4))&0xfffffff0)-1;
+          for (i=0;i<=10;i++)
+                 num[i]=den[i]=0;
+          for (i=0;i<10;i++)
+                 mem[i]=0;
+          for (i=0;i<ord+1;i++)
+          {
+                 num[i]=_num[i];
+                 den[i]=_den[i];
+          }
+          for (i=0;i<ord;i++)
+                 mem[i]=_mem[i];
+          for (i=0;i<N;i+=4)
+          {
+                  float *in1 = x+i;
+                  float *in2 = y+i;
+#ifdef WIN32
+                       /*
+                       Do we need to push???
+                       push eax
+                       push ebx
+                       push ecx
+                       push edx
+                       */
+                  _asm
+                       {
+                       mov eax, num
+                       mov ebx, den
+                       mov ecx, mem
+                       mov edx, in1
+                       movss xmm0, [edx]
+                       movss xmm1, [ecx]
+                       addss xmm1, xmm0
+                       mov edx, in2
+                       movss [edx], xmm1
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [ebx+4]
+                       mulps  xmm2, xmm0
+                       mulps  xmm3, xmm1
+                       movaps xmm4, [eax+20]
+                       mulps  xmm4, xmm0
+                       addps  xmm2, [ecx+4]
+                       movaps xmm5, [ebx+20]
+                       mulps  xmm5, xmm1
+                       addps  xmm4, [ecx+20]
+                       subps  xmm2, xmm3
+                       movups [ecx], xmm2
+                       subps  xmm4, xmm5
+                       movups [ecx+16], xmm4
+                       movss  xmm2, [eax+36]
+                       mulss  xmm2, xmm0
+                       movss  xmm3, [ebx+36]
+                       mulss  xmm3, xmm1
+                       addss  xmm2, [ecx+36]
+                       movss  xmm4, [eax+40]
+                       mulss  xmm4, xmm0
+                       movss  xmm5, [ebx+40]
+                       mulss  xmm5, xmm1
+                       subss  xmm2, xmm3
+                       movss  [ecx+32], xmm2
+                       subss  xmm4, xmm5
+                       movss  [ecx+36], xmm4
+                       mov edx, in1
+                       movss xmm0, [edx+4]
+                       movss xmm1, [ecx]
+                       addss xmm1, xmm0
+                       mov edx, in2
+                       movss [edx+4], xmm1
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [ebx+4]
+                       mulps  xmm2, xmm0
+                       mulps  xmm3, xmm1
+                       movaps xmm4, [eax+20]
+                       mulps  xmm4, xmm0
+                       addps  xmm2, [ecx+4]
+                       movaps xmm5, [ebx+20]
+                       mulps  xmm5, xmm1
+                       addps  xmm4, [ecx+20]
+                       subps  xmm2, xmm3
+                       movups [ecx], xmm2
+                       subps  xmm4, xmm5
+                       movups [ecx+16], xmm4
+                       movss  xmm2, [eax+36]
+                       mulss  xmm2, xmm0
+                       movss  xmm3, [ebx+36]
+                       mulss  xmm3, xmm1
+                       addss  xmm2, [ecx+36]
+                       movss  xmm4, [eax+40]
+                       mulss  xmm4, xmm0
+                       movss  xmm5, [ebx+40]
+                       mulss  xmm5, xmm1
+                       subss  xmm2, xmm3
+                       movss  [ecx+32], xmm2
+                       subss  xmm4, xmm5
+                       movss  [ecx+36], xmm4
+                       mov edx, in1
+                       movss xmm0, [edx+8]
+                       movss xmm1, [ecx]
+                       addss xmm1, xmm0
+                       mov edx, in2
+                       movss [edx+8], xmm1
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [ebx+4]
+                       mulps  xmm2, xmm0
+                       mulps  xmm3, xmm1
+                       movaps xmm4, [eax+20]
+                       mulps  xmm4, xmm0
+                       addps  xmm2, [ecx+4]
+                       movaps xmm5, [ebx+20]
+                       mulps  xmm5, xmm1
+                       addps  xmm4, [ecx+20]
+                       subps  xmm2, xmm3
+                       movups [ecx], xmm2
+                       subps  xmm4, xmm5
+                       movups [ecx+16], xmm4
+                       movss  xmm2, [eax+36]
+                       mulss  xmm2, xmm0
+                       movss  xmm3, [ebx+36]
+                       mulss  xmm3, xmm1
+                       addss  xmm2, [ecx+36]
+                       movss  xmm4, [eax+40]
+                       mulss  xmm4, xmm0
+                       movss  xmm5, [ebx+40]
+                       mulss  xmm5, xmm1
+                       subss  xmm2, xmm3
+                       movss  [ecx+32], xmm2
+                       subss  xmm4, xmm5
+                       movss  [ecx+36], xmm4
+                       mov edx, in1
+                       movss xmm0, [edx+12]
+                       movss xmm1, [ecx]
+                       addss xmm1, xmm0
+                       mov edx, in2
+                       movss [edx+12], xmm1
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [ebx+4]
+                       mulps  xmm2, xmm0
+                       mulps  xmm3, xmm1
+                       movaps xmm4, [eax+20]
+                       mulps  xmm4, xmm0
+                       addps  xmm2, [ecx+4]
+                       movaps xmm5, [ebx+20]
+                       mulps  xmm5, xmm1
+                       addps  xmm4, [ecx+20]
+                       subps  xmm2, xmm3
+                       movups [ecx], xmm2
+                       subps  xmm4, xmm5
+                       movups [ecx+16], xmm4
+                       movss  xmm2, [eax+36]
+                       mulss  xmm2, xmm0
+                       movss  xmm3, [ebx+36]
+                       mulss  xmm3, xmm1
+                       addss  xmm2, [ecx+36]
+                       movss  xmm4, [eax+40]
+                       mulss  xmm4, xmm0
+                       movss  xmm5, [ebx+40]
+                       mulss  xmm5, xmm1
+                       subss  xmm2, xmm3
+                       movss  [ecx+32], xmm2
+                       subss  xmm4, xmm5
+                       movss  [ecx+36], xmm4
+                  }
+                  /*
+                       pop edx
+                       pop ecx
+                       pop ebx
+                       pop eax
+                       */
+#if defined(GCC_COMPILER) && defined(_USE_SSE)
+                 __asm__ __volatile__
+                 (
+                  "\tmovss (%1), %%xmm0\n"
+                  "\tmovss (%0), %%xmm1\n"
+                  "\taddss %%xmm0, %%xmm1\n"
+                  "\tmovss %%xmm1, (%2)\n"
+                  "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                  "\tshufps $0x00, %%xmm1, %%xmm1\n"
+                  "\tmovaps 4(%3),  %%xmm2\n"
+                  "\tmovaps 4(%4),  %%xmm3\n"
+                  "\tmulps  %%xmm0, %%xmm2\n"
+                  "\tmulps  %%xmm1, %%xmm3\n"
+                  "\tmovaps 20(%3), %%xmm4\n"
+                  "\tmulps  %%xmm0, %%xmm4\n"
+                  "\taddps  4(%0),  %%xmm2\n"
+                  "\tmovaps 20(%4), %%xmm5\n"
+                  "\tmulps  %%xmm1, %%xmm5\n"
+                  "\taddps  20(%0), %%xmm4\n"
+                  "\tsubps  %%xmm3, %%xmm2\n"
+                  "\tmovups %%xmm2, (%0)\n"
+                  "\tsubps  %%xmm5, %%xmm4\n"
+                  "\tmovups %%xmm4, 16(%0)\n"
+                  "\tmovss  36(%3), %%xmm2\n"
+                  "\tmulss  %%xmm0, %%xmm2\n"
+                  "\tmovss  36(%4), %%xmm3\n"
+                  "\tmulss  %%xmm1, %%xmm3\n"
+                  "\taddss  36(%0), %%xmm2\n"
+                  "\tmovss  40(%3), %%xmm4\n"
+                  "\tmulss  %%xmm0, %%xmm4\n"
+                  "\tmovss  40(%4), %%xmm5\n"
+                  "\tmulss  %%xmm1, %%xmm5\n"
+                  "\tsubss  %%xmm3, %%xmm2\n"
+                  "\tmovss  %%xmm2, 32(%0)\n"
+                  "\tsubss  %%xmm5, %%xmm4\n"
+                  "\tmovss  %%xmm4, 36(%0)\n"
+                  "\tmovss 4(%1), %%xmm0\n"
+                  "\tmovss (%0), %%xmm1\n"
+                  "\taddss %%xmm0, %%xmm1\n"
+                  "\tmovss %%xmm1, 4(%2)\n"
+                  "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                  "\tshufps $0x00, %%xmm1, %%xmm1\n"
+                  "\tmovaps 4(%3),  %%xmm2\n"
+                  "\tmovaps 4(%4),  %%xmm3\n"
+                  "\tmulps  %%xmm0, %%xmm2\n"
+                  "\tmulps  %%xmm1, %%xmm3\n"
+                  "\tmovaps 20(%3), %%xmm4\n"
+                  "\tmulps  %%xmm0, %%xmm4\n"
+                  "\taddps  4(%0),  %%xmm2\n"
+                  "\tmovaps 20(%4), %%xmm5\n"
+                  "\tmulps  %%xmm1, %%xmm5\n"
+                  "\taddps  20(%0), %%xmm4\n"
+                  "\tsubps  %%xmm3, %%xmm2\n"
+                  "\tmovups %%xmm2, (%0)\n"
+                  "\tsubps  %%xmm5, %%xmm4\n"
+                  "\tmovups %%xmm4, 16(%0)\n"
+                  "\tmovss  36(%3), %%xmm2\n"
+                  "\tmulss  %%xmm0, %%xmm2\n"
+                  "\tmovss  36(%4), %%xmm3\n"
+                  "\tmulss  %%xmm1, %%xmm3\n"
+                  "\taddss  36(%0), %%xmm2\n"
+                  "\tmovss  40(%3), %%xmm4\n"
+                  "\tmulss  %%xmm0, %%xmm4\n"
+                  "\tmovss  40(%4), %%xmm5\n"
+                  "\tmulss  %%xmm1, %%xmm5\n"
+                  "\tsubss  %%xmm3, %%xmm2\n"
+                  "\tmovss  %%xmm2, 32(%0)\n"
+                  "\tsubss  %%xmm5, %%xmm4\n"
+                  "\tmovss  %%xmm4, 36(%0)\n"
+                  "\tmovss 8(%1), %%xmm0\n"
+                  "\tmovss (%0), %%xmm1\n"
+                  "\taddss %%xmm0, %%xmm1\n"
+                  "\tmovss %%xmm1, 8(%2)\n"
+                  "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                  "\tshufps $0x00, %%xmm1, %%xmm1\n"
+                  "\tmovaps 4(%3),  %%xmm2\n"
+                  "\tmovaps 4(%4),  %%xmm3\n"
+                  "\tmulps  %%xmm0, %%xmm2\n"
+                  "\tmulps  %%xmm1, %%xmm3\n"
+                  "\tmovaps 20(%3), %%xmm4\n"
+                  "\tmulps  %%xmm0, %%xmm4\n"
+                  "\taddps  4(%0),  %%xmm2\n"
+                  "\tmovaps 20(%4), %%xmm5\n"
+                  "\tmulps  %%xmm1, %%xmm5\n"
+                  "\taddps  20(%0), %%xmm4\n"
+                  "\tsubps  %%xmm3, %%xmm2\n"
+                  "\tmovups %%xmm2, (%0)\n"
+                  "\tsubps  %%xmm5, %%xmm4\n"
+                  "\tmovups %%xmm4, 16(%0)\n"
+                  "\tmovss  36(%3), %%xmm2\n"
+                  "\tmulss  %%xmm0, %%xmm2\n"
+                  "\tmovss  36(%4), %%xmm3\n"
+                  "\tmulss  %%xmm1, %%xmm3\n"
+                  "\taddss  36(%0), %%xmm2\n"
+                  "\tmovss  40(%3), %%xmm4\n"
+                  "\tmulss  %%xmm0, %%xmm4\n"
+                  "\tmovss  40(%4), %%xmm5\n"
+                  "\tmulss  %%xmm1, %%xmm5\n"
+                  "\tsubss  %%xmm3, %%xmm2\n"
+                  "\tmovss  %%xmm2, 32(%0)\n"
+                  "\tsubss  %%xmm5, %%xmm4\n"
+                  "\tmovss  %%xmm4, 36(%0)\n"
+                  "\tmovss 12(%1), %%xmm0\n"
+                  "\tmovss (%0), %%xmm1\n"
+                  "\taddss %%xmm0, %%xmm1\n"
+                  "\tmovss %%xmm1, 12(%2)\n"
+                  "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                  "\tshufps $0x00, %%xmm1, %%xmm1\n"
+                  "\tmovaps 4(%3),  %%xmm2\n"
+                  "\tmovaps 4(%4),  %%xmm3\n"
+                  "\tmulps  %%xmm0, %%xmm2\n"
+                  "\tmulps  %%xmm1, %%xmm3\n"
+                  "\tmovaps 20(%3), %%xmm4\n"
+                  "\tmulps  %%xmm0, %%xmm4\n"
+                  "\taddps  4(%0),  %%xmm2\n"
+                  "\tmovaps 20(%4), %%xmm5\n"
+                  "\tmulps  %%xmm1, %%xmm5\n"
+                  "\taddps  20(%0), %%xmm4\n"
+                  "\tsubps  %%xmm3, %%xmm2\n"
+                  "\tmovups %%xmm2, (%0)\n"
+                  "\tsubps  %%xmm5, %%xmm4\n"
+                  "\tmovups %%xmm4, 16(%0)\n"
+                  "\tmovss  36(%3), %%xmm2\n"
+                  "\tmulss  %%xmm0, %%xmm2\n"
+                  "\tmovss  36(%4), %%xmm3\n"
+                  "\tmulss  %%xmm1, %%xmm3\n"
+                  "\taddss  36(%0), %%xmm2\n"
+                  "\tmovss  40(%3), %%xmm4\n"
+                  "\tmulss  %%xmm0, %%xmm4\n"
+                  "\tmovss  40(%4), %%xmm5\n"
+                  "\tmulss  %%xmm1, %%xmm5\n"
+                  "\tsubss  %%xmm3, %%xmm2\n"
+                  "\tmovss  %%xmm2, 32(%0)\n"
+                  "\tsubss  %%xmm5, %%xmm4\n"
+                  "\tmovss  %%xmm4, 36(%0)\n"
+                  : : "r" (mem), "r" (x+i), "r" (y+i), "r" (num), "r" (den)
+                  : "memory" );
+               }
+               for (i=0;i<ord;i++)
+                       _mem[i]=mem[i];
+       }

-void iir_mem2(float *x, float *den, float *y, int N, int ord, float *mem)
+void iir_mem2(float *x, float *_den, float *y, int N, int ord, float *_mem)
-   int i,j;
-   for (i=0;i<N;i++)
-   {
-      y[i] = x[i] + mem[0];
-      for (j=0;j<ord-1;j++)
-      {
-         mem[j] = mem[j+1] - den[j+1]*y[i];
-      }
-      mem[ord-1] = - den[ord]*y[i];
-   }
+       if (!(global_use_mmx_sse & SPEEX_ASM_MMX_SSE_FP))
+       {
+          int i,j;
+          for (i=0;i<N;i++)
+          {
+                 y[i] = x[i] + _mem[0];
+                 for (j=0;j<ord-1;j++)
+                 {
+                        _mem[j] = _mem[j+1] - _den[j+1]*y[i];
+                 }
+                 _mem[ord-1] = - _den[ord]*y[i];
+          }
+       }
+       else
+       {
+               float  __den[20], __mem[20];
+               float *den, *mem;
+               int i;
+               den = (float*)(((int)(__den+4))&0xfffffff0)-1;
+               mem = (float*)(((int)(__mem+4))&0xfffffff0)-1;
+               for (i=0;i<=10;i++)
+               {
+                       den[i]=0;
+               }
+               for (i=0;i<10;i++)
+               {
+                       mem[i]=0;
+               }
+               for (i=0;i<ord+1;i++)
+               {
+                       den[i]=_den[i];
+               }
+               for (i=0;i<ord;i++)
+               {
+                       mem[i]=_mem[i];
+               }
+               for (i=0;i<N;i++)
+               {
+/* The assembler block below is equivalent to this code
+                       y[i] = x[i] + mem[0];
+                       for (j=0;j<ord-1;j++)
+                       {
+                               mem[j] = mem[j+1] - den[j+1]*y[i];
+                       }
+                       mem[ord-1] = - den[ord]*y[i]
+                       float *in1 = x+i;
+                       float *in2 = y+i;
+#ifdef WIN32
+                       /*
+                       Do we need to push???
+                       push eax
+                       push ebx
+                       push ecx
+                       push edx
+                       */
+                       _asm
+                       {
+                       mov eax, den
+                       mov ebx, mem
+                       mov ecx, in1
+                       mov edx, in2
+                       movss xmm0, [ecx]
+                       movss xmm1, [ebx]
+                       addss xmm1, xmm0
+                       movss [edx], xmm1
+                       shufps xmm0, xmm0, 0x00
+                       shufps xmm1, xmm1, 0x00
+                       movaps xmm2, [eax+4]
+                       movaps xmm3, [eax+20]
+                       mulps  xmm2, xmm1
+                       mulps  xmm3, xmm1
+                       movss  xmm4, [eax+36]
+                       movss  xmm5, [eax+40]
+                       mulss  xmm4, xmm1
+                       mulss  xmm5, xmm1
+                       movaps xmm6, [ebx+4]
+                       subps  xmm6, xmm2
+                       movups [ebx], xmm6
+                       movaps xmm7, [ebx+20]
+                       subps  xmm7, xmm3
+                       movups [ebx+16], xmm7
+                       movss  xmm7, [ebx+36]
+                       subss  xmm7, xmm4
+                       movss  [ebx+32], xmm7
+                       xorps  xmm2, xmm2
+                       subss  xmm2, xmm5
+                       movss  [ebx+36], xmm2
+                       }
+                  /*
+                       pop edx
+                       pop ecx
+                       pop ebx
+                       pop eax
+                       */
+#if defined(GCC_COMPILER) && defined(_USE_SSE)
+                       __asm__ __volatile__ (
+                       "\tmovss (%1), %%xmm0\n"
+                       "\tmovss (%0), %%xmm1\n"
+                       "\taddss %%xmm0, %%xmm1\n"
+                       "\tmovss %%xmm1, (%2)\n"
+                       "\tshufps $0x00, %%xmm0, %%xmm0\n"
+                       "\tshufps $0x00, %%xmm1, %%xmm1\n"
+                       "\tmovaps 4(%3),  %%xmm2\n"
+                       "\tmovaps 20(%3), %%xmm3\n"
+                       "\tmulps  %%xmm1, %%xmm2\n"
+                       "\tmulps  %%xmm1, %%xmm3\n"
+                       "\tmovss  36(%3), %%xmm4\n"
+                       "\tmovss  40(%3), %%xmm5\n"
+                       "\tmulss  %%xmm1, %%xmm4\n"
+                       "\tmulss  %%xmm1, %%xmm5\n"
+                       "\tmovaps 4(%0),  %%xmm6\n"
+                       "\tsubps  %%xmm2, %%xmm6\n"
+                       "\tmovups %%xmm6, (%0)\n"
+                       "\tmovaps 20(%0), %%xmm7\n"
+                       "\tsubps  %%xmm3, %%xmm7\n"
+                       "\tmovups %%xmm7, 16(%0)\n"
+                       "\tmovss  36(%0), %%xmm7\n"
+                       "\tsubss  %%xmm4, %%xmm7\n"
+                       "\tmovss  %%xmm7, 32(%0)\n"
+                       "\txorps  %%xmm2, %%xmm2\n"
+                       "\tsubss  %%xmm5, %%xmm2\n"
+                       "\tmovss  %%xmm2, 36(%0)\n"
+                       : : "r" (mem), "r" (x+i), "r" (y+i), "r" (den) : 
"memory" );
+               }
+               for (i=0;i<ord;i++)
+               {
+                       _mem[i]=mem[i];
+               }
+       }

  void fir_mem2(float *x, float *num, float *y, int N, int ord, float *mem)
diff -ru speex-1.0.3-orig/libspeex/ltp.c speex-1.0.3/libspeex/ltp.c
--- speex-1.0.3-orig/libspeex/ltp.c     2003-05-05 18:20:26.000000000 -0700
+++ speex-1.0.3/libspeex/ltp.c  2004-01-08 20:44:45.000000000 -0800
@@ -34,37 +34,168 @@
  #include "ltp.h"
  #include "stack_alloc.h"
  #include "filters.h"
+#include "speex.h"
  #include "speex_bits.h"

-#ifdef _USE_SSE
-#include "ltp_sse.h"
-static float inner_prod(float *x, float *y, int len)
+extern int global_use_mmx_sse;
+static float inner_prod(float *a, float *b, int len)
-   int i;
-   float sum1=0,sum2=0,sum3=0,sum4=0;
-   for (i=0;i<len;)
-   {
-      sum1 += x[i]*y[i];
-      sum2 += x[i+1]*y[i+1];
-      sum3 += x[i+2]*y[i+2];
-      sum4 += x[i+3]*y[i+3];
-      i+=4;
-   }
-   return sum1+sum2+sum3+sum4;
+       if (!(global_use_mmx_sse & SPEEX_ASM_MMX_SSE_FP))
+       {
+//Older, slower version
+               int i;
+               float sum=0;
+               for (i=0;i<len;i++)
+                       sum += a[i]*b[i];
+               return sum;
+*/             int i;
+               float sum1=0,sum2=0,sum3=0,sum4=0;
+               for (i=0;i<len;)
+               {
+                       sum1 += a[i]*b[i];
+                       sum2 += a[i+1]*b[i+1];
+                       sum3 += a[i+2]*b[i+2];
+                       sum4 += a[i+3]*b[i+3];
+                       i+=4;
+               }
+               return sum1+sum2+sum3+sum4;
+       }
+       else
+       {
+               float sum;
+               float* sum1 = &sum;
+#ifdef WIN32
+               _asm
+               {
+                 push eax
+                 push ebx
+                 push ecx
+                 push edx
+                 push edi
+                 mov eax, a
+                 mov edi, b
+                 mov ecx, len
+                 mov edx, sum1
+                 xorps xmm3, xmm3
+                 xorps xmm4, xmm4
+                 sub ecx, 20
+               mul20_loop:
+                 movups xmm0, [eax]
+                 movups xmm1, [edi]
+                 mulps xmm1, xmm0
+                 movups xmm5, [eax+16]
+                 movups xmm6, [edi+16]
+                 mulps xmm6, xmm5
+                 addps xmm3, xmm1
+                 movups xmm0, [eax+32]
+                 movups xmm1, [edi+32]
+                 mulps xmm1, xmm0
+                 addps xmm4, xmm6
+                 movups xmm5, [eax+48]
+                 movups xmm6, [edi+48]
+                 mulps xmm6, xmm5
+                 addps xmm3, xmm1
+                 movups xmm0, [eax+64]
+                 movups xmm1, [edi+64]
+                 mulps xmm1, xmm0
+                 addps xmm4, xmm6
+                 addps xmm3, xmm1
+                 add eax, 80
+                 add edi, 80
+                 sub ecx, 20
+                 jae mul20_loop
+                 addps xmm3, xmm4
+                 movhlps xmm4, xmm3
+                 addps xmm3, xmm4
+                 movaps xmm4, xmm3
+                 shufps xmm4, xmm4, 0x55
+                 addss xmm3, xmm4
+                 movss [edx], xmm3
+                 pop edi
+                 pop edx
+                 pop ecx
+                 pop ebx
+                 pop eax
+                 }
+#if defined(GCC_COMPILER) && defined(_USE_SSE)
+                 __asm__ __volatile__ (
+                 "\tpush %%eax\n"
+                 "\tpush %%edi\n"
+                 "\tpush %%ecx\n"
+                 "\txorps %%xmm3, %%xmm3\n"
+                 "\txorps %%xmm4, %%xmm4\n"
+                 "\tsub $20, %%ecx\n"
+               ".mul20_loop%=:\n"
+                 "\tmovups (%%eax), %%xmm0\n"
+                 "\tmovups (%%edi), %%xmm1\n"
+                 "\tmulps %%xmm0, %%xmm1\n"
+                 "\tmovups 16(%%eax), %%xmm5\n"
+                 "\tmovups 16(%%edi), %%xmm6\n"
+                 "\tmulps %%xmm5, %%xmm6\n"
+                 "\taddps %%xmm1, %%xmm3\n"
+                 "\tmovups 32(%%eax), %%xmm0\n"
+                 "\tmovups 32(%%edi), %%xmm1\n"
+                 "\tmulps %%xmm0, %%xmm1\n"
+                 "\taddps %%xmm6, %%xmm4\n"
+                 "\tmovups 48(%%eax), %%xmm5\n"
+                 "\tmovups 48(%%edi), %%xmm6\n"
+                 "\tmulps %%xmm5, %%xmm6\n"
+                 "\taddps %%xmm1, %%xmm3\n"
+                 "\tmovups 64(%%eax), %%xmm0\n"
+                 "\tmovups 64(%%edi), %%xmm1\n"
+                 "\tmulps %%xmm0, %%xmm1\n"
+                 "\taddps %%xmm6, %%xmm4\n"
+                 "\taddps %%xmm1, %%xmm3\n"
+                 "\tadd $80, %%eax\n"
+                 "\tadd $80, %%edi\n"
+                 "\tsub $20,  %%ecx\n"
+                 "\tjae .mul20_loop%=\n"
+                 "\taddps %%xmm4, %%xmm3\n"
+                 "\tmovhlps %%xmm3, %%xmm4\n"
+                 "\taddps %%xmm4, %%xmm3\n"
+                 "\tmovaps %%xmm3, %%xmm4\n"
+                 "\tshufps $0x55, %%xmm4, %%xmm4\n"
+                 "\taddss %%xmm4, %%xmm3\n"
+                 "\tmovss %%xmm3, (%%edx)\n"
+                 "\tpop %%ecx\n"
+                 "\tpop %%edi\n"
+                 "\tpop %%eax\n"
+                 : : "a" (a), "D" (b), "c" (len), "d" (&sum) : "memory");
+               return sum;
+       }

-/*Original, non-optimized version*/
-/*static float inner_prod(float *x, float *y, int len)
-   int i;
-   float sum=0;
-   for (i=0;i<len;i++)
-      sum += x[i]*y[i];
-   return sum;

  void open_loop_nbest_pitch(float *sw, int start, int end, int len, int 
*pitch, float *gain, int N, char *stack)
diff -ru speex-1.0.3-orig/libspeex/modes.c speex-1.0.3/libspeex/modes.c
--- speex-1.0.3-orig/libspeex/modes.c   2003-06-02 22:29:39.000000000 -0700
+++ speex-1.0.3/libspeex/modes.c        2004-01-08 20:46:26.000000000 -0800
@@ -45,6 +45,7 @@
  #define NULL 0

+int global_use_mmx_sse = 0;
  SpeexMode *speex_mode_list[SPEEX_NB_MODES] = {&speex_nb_mode, 
&speex_wb_mode, &speex_uwb_mode};

  /* Extern declarations for all codebooks we use here */
@@ -585,16 +586,27 @@

  int speex_encoder_ctl(void *state, int request, void *ptr)
-   return (*((SpeexMode**)state))->enc_ctl(state, request, ptr);
+       if (request == SPEEX_SET_ASM_FLAG)
+       {
+               global_use_mmx_sse = *((int*)ptr);
+               return 0;
+       }
+       else
+               return (*((SpeexMode**)state))->enc_ctl(state, request, ptr);

  int speex_decoder_ctl(void *state, int request, void *ptr)
-   return (*((SpeexMode**)state))->dec_ctl(state, request, ptr);
+       if (request == SPEEX_SET_ASM_FLAG)
+       {
+               global_use_mmx_sse = *((int*)ptr);
+               return 0;
+       }
+       else
+          return (*((SpeexMode**)state))->dec_ctl(state, request, ptr);

  static int nb_mode_query(void *mode, int request, void *ptr)
     SpeexNBMode *m = (SpeexNBMode*)mode;
diff -ru speex-1.0.3-orig/libspeex/speex.h speex-1.0.3/libspeex/speex.h
--- speex-1.0.3-orig/libspeex/speex.h   2003-01-22 23:29:39.000000000 -0800
+++ speex-1.0.3/libspeex/speex.h        2004-01-08 20:46:21.000000000 -0800
@@ -156,6 +156,13 @@
  /** Number of defined modes in Speex */
  #define SPEEX_NB_MODES 3

+#define SPEEX_SET_ASM_FLAG             200
+#define SPEEX_ASM_MMX_NONE             0
+#define SPEEX_ASM_MMX_BASIC            1
+#define SPEEX_ASM_MMX_SSE              2
+#define SPEEX_ASM_MMX_SSE_FP   4
  struct SpeexMode;

-------------- next part --------------
A non-text attachment was scrubbed...
Name: speex-sse-patch.gz
Type: application/octet-stream
Size: 3886 bytes
Desc: speex-sse-patch.gz
Url : http://lists.xiph.org/pipermail/speex-dev/attachments/20040108/99cf64fa/speex-sse-patch.obj

More information about the Speex-dev mailing list