[xiph-cvs] cvs commit: speex/libspeex filters_sse.h ltp_sse.h Makefile.am filters.c ltp.c

Fri Oct 25 21:37:15 PDT 2002

jm          02/10/26 00:37:15

  Modified:    libspeex Makefile.am filters.c ltp.c
  Added:       libspeex filters_sse.h ltp_sse.h
  Log:
  Added SSE support (gcc only) by defining _USE_SSE

Revision  Changes    Path
1.41      +4 -2      speex/libspeex/Makefile.am

Index: Makefile.am
===================================================================
RCS file: /usr/local/cvsroot/speex/libspeex/Makefile.am,v
retrieving revision 1.40
retrieving revision 1.41
diff -u -r1.40 -r1.41

--- Makefile.am	2 Oct 2002 19:52:52 -0000	1.40
+++ Makefile.am	26 Oct 2002 04:37:15 -0000	1.41
@@ -1,6 +1,6 @@
 ## Process this file with automake to produce Makefile.in. -*-Makefile-*-
 
-# $Id: Makefile.am,v 1.40 2002/10/02 19:52:52 jmvalin Exp $
+# $Id: Makefile.am,v 1.41 2002/10/26 04:37:15 jm Exp $
 
 # Disable automatic dependency tracking if using other tools than gcc and gmake
 #AUTOMAKE_OPTIONS = no-dependencies
@@ -53,7 +53,9 @@
         modes.h \
         sb_celp.h \
         vbr.h \
-	misc.h
+	misc.h \
+	ltp_sse.h \
+	filters_sse.h
         
         
 libspeex_la_LDFLAGS = -release $(LT_RELEASE)

1.27      +15 -14    speex/libspeex/filters.c

Index: filters.c
===================================================================
RCS file: /usr/local/cvsroot/speex/libspeex/filters.c,v
retrieving revision 1.26
retrieving revision 1.27
diff -u -r1.26 -r1.27
--- filters.c	26 Oct 2002 02:58:36 -0000	1.26
+++ filters.c	26 Oct 2002 04:37:15 -0000	1.27
@@ -55,7 +55,9 @@
    }
 }
 
-
+#ifdef _USE_SSE
+#include "filters_sse.h"
+#else
 void filter_mem2(float *x, float *num, float *den, float *y, int N, int ord, float *mem)
 {
    int i,j;
@@ -74,38 +76,37 @@
 }
 
 
-void fir_mem2(float *x, float *num, float *y, int N, int ord, float *mem)
+void iir_mem2(float *x, float *den, float *y, int N, int ord, float *mem)
 {
    int i,j;
-   float xi;
    for (i=0;i<N;i++)
    {
-      xi=x[i];
-      y[i] = num[0]*xi + mem[0];
+      y[i] = x[i] + mem[0];
       for (j=0;j<ord-1;j++)
       {
-         mem[j] = mem[j+1] + num[j+1]*xi;
+         mem[j] = mem[j+1] - den[j+1]*y[i];
       }
-      mem[ord-1] = num[ord]*xi;
+      mem[ord-1] = - den[ord]*y[i];
    }
 }
+#endif
 
-void iir_mem2(float *x, float *den, float *y, int N, int ord, float *mem)
+void fir_mem2(float *x, float *num, float *y, int N, int ord, float *mem)
 {
    int i,j;
+   float xi;
    for (i=0;i<N;i++)
    {
-      y[i] = x[i] + mem[0];
+      xi=x[i];
+      y[i] = num[0]*xi + mem[0];
       for (j=0;j<ord-1;j++)
       {
-         mem[j] = mem[j+1] - den[j+1]*y[i];
+         mem[j] = mem[j+1] + num[j+1]*xi;
       }
-      mem[ord-1] = - den[ord]*y[i];
+      mem[ord-1] = num[ord]*xi;
    }
 }
 
-
-
 void syn_percep_zero(float *xx, float *ak, float *awk1, float *awk2, float *y, int N, int ord, float *stack)
 {
    int i;
@@ -164,7 +165,7 @@
      mem[i]=xx[N-i-1];
 }
 
-
+/* By segher */
 void fir_mem_up(float *x, float *a, float *y, int N, int M, float *mem)
    /* assumptions:
       all odd x[i] are zero -- well, actually they are left out of the array now

1.61      +4 -1      speex/libspeex/ltp.c

Index: ltp.c
===================================================================
RCS file: /usr/local/cvsroot/speex/libspeex/ltp.c,v
retrieving revision 1.60
retrieving revision 1.61
diff -u -r1.60 -r1.61
--- ltp.c	24 Oct 2002 06:29:13 -0000	1.60
+++ ltp.c	26 Oct 2002 04:37:15 -0000	1.61
@@ -37,7 +37,9 @@
 #include "filters.h"
 #include "speex_bits.h"
 
-
+#ifdef _USE_SSE
+#include "ltp_sse.h"
+#else
 static float inner_prod(float *x, float *y, int len)
 {
    int i;
@@ -52,6 +54,7 @@
    }
    return sum1+sum2+sum3+sum4;
 }
+#endif
 
 /*Original, non-optimized version*/
 /*static float inner_prod(float *x, float *y, int len)

1.1                  speex/libspeex/filters_sse.h

Index: filters_sse.h
===================================================================
/* Copyright (C) 2002 Jean-Marc Valin 
   File: filters_sse.h
   Various analysis/synthesis filters (SSE versions, gcc inline assembly)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
   
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
   
   - Neither the name of the Xiph.org Foundation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.
   
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* One step of the order-10 pole-zero filter for a single sample (SSE).

   Computes  yi = *x + mem[0], stores it to *y, then updates the ten
   state values:
      mem[j] = mem[j+1] + num[j+1]*xi - den[j+1]*yi   (j = 0..8)
      mem[9] =            num[10]*xi  - den[10]*yi
   num[0] and den[0] are never multiplied in (the kernel treats both as 1).

   Alignment contract: num+1, den+1 and mem+1 must be 16-byte aligned,
   because they are accessed with movaps / memory-operand addps. */
static __inline__ void filter_mem2_step10(float *mem, const float *x, float *y,
                                          const float *num, const float *den)
{
   __asm__ __volatile__ (
      "movss  (%1), %%xmm0\n\t"          /* xi                          */
      "movss  (%0), %%xmm1\n\t"          /* mem[0]                      */
      "addss  %%xmm0, %%xmm1\n\t"        /* yi = xi + mem[0]            */
      "movss  %%xmm1, (%2)\n\t"          /* *y = yi                     */
      "shufps $0x00, %%xmm0, %%xmm0\n\t" /* broadcast xi to all lanes   */
      "shufps $0x00, %%xmm1, %%xmm1\n\t" /* broadcast yi to all lanes   */

      "movaps 4(%3),  %%xmm2\n\t"        /* num[1..4]                   */
      "movaps 4(%4),  %%xmm3\n\t"        /* den[1..4]                   */
      "mulps  %%xmm0, %%xmm2\n\t"
      "mulps  %%xmm1, %%xmm3\n\t"
      "movaps 20(%3), %%xmm4\n\t"        /* num[5..8]                   */
      "mulps  %%xmm0, %%xmm4\n\t"
      "addps  4(%0),  %%xmm2\n\t"        /* + mem[1..4]                 */
      "movaps 20(%4), %%xmm5\n\t"        /* den[5..8]                   */
      "mulps  %%xmm1, %%xmm5\n\t"
      "addps  20(%0), %%xmm4\n\t"        /* + mem[5..8]                 */
      "subps  %%xmm3, %%xmm2\n\t"
      "movups %%xmm2, (%0)\n\t"          /* -> mem[0..3] (unaligned)    */
      "subps  %%xmm5, %%xmm4\n\t"
      "movups %%xmm4, 16(%0)\n\t"        /* -> mem[4..7] (unaligned)    */

      "movss  36(%3), %%xmm2\n\t"        /* num[9]                      */
      "mulss  %%xmm0, %%xmm2\n\t"
      "movss  36(%4), %%xmm3\n\t"        /* den[9]                      */
      "mulss  %%xmm1, %%xmm3\n\t"
      "addss  36(%0), %%xmm2\n\t"        /* + mem[9]                    */
      "movss  40(%3), %%xmm4\n\t"        /* num[10]                     */
      "mulss  %%xmm0, %%xmm4\n\t"
      "movss  40(%4), %%xmm5\n\t"        /* den[10]                     */
      "mulss  %%xmm1, %%xmm5\n\t"
      "subss  %%xmm3, %%xmm2\n\t"
      "movss  %%xmm2, 32(%0)\n\t"        /* -> mem[8]                   */
      "subss  %%xmm5, %%xmm4\n\t"
      "movss  %%xmm4, 36(%0)\n\t"        /* -> mem[9]                   */
      :
      : "r" (mem), "r" (x), "r" (y), "r" (num), "r" (den)
      : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

/* Pole-zero filter with state kept in _mem (SSE version, gcc only).

   x     input samples (N of them); may be the same array as y
   _num  numerator coefficients, indices 0..ord are read
   _den  denominator coefficients, indices 0..ord are read
   y     output samples (exactly N are written)
   N     number of samples; any value is accepted, not only multiples of 4
   ord   filter order
   _mem  filter state, ord values, updated in place

   The leading coefficients _num[0] and _den[0] are not applied: the
   output is always x[i] + mem[0].

   The SSE kernel is hard-wired for order 10.  Orders below 10 are run
   through it by zero-padding the coefficients and the state; orders
   above 10 take a plain C loop that computes the same recurrence. */
void filter_mem2(float *x, float *_num, float *_den, float *y, int N, int ord, float *_mem)
{
   float num_buf[20], den_buf[20], mem_buf[20];
   float *num, *den, *mem;
   float xi, yi;
   int i,j;

   if (ord > 10)
   {
      /* The scratch buffers and the kernel only cover order 10. */
      for (i=0;i<N;i++)
      {
         xi = x[i];
         yi = xi + _mem[0];
         y[i] = yi;
         for (j=0;j<ord-1;j++)
            _mem[j] = _mem[j+1] + _num[j+1]*xi - _den[j+1]*yi;
         _mem[ord-1] = _num[ord]*xi - _den[ord]*yi;
      }
      return;
   }

   /* Place each working array so that element [1] sits on a 16-byte
      boundary.  Only the low four address bits are inspected, so this
      does not depend on the width of a pointer. */
   num = num_buf+4;
   num = (float*)((char*)num - ((unsigned long)num & 0xf)) - 1;
   den = den_buf+4;
   den = (float*)((char*)den - ((unsigned long)den & 0xf)) - 1;
   mem = mem_buf+4;
   mem = (float*)((char*)mem - ((unsigned long)mem & 0xf)) - 1;

   /* Zero-pad up to order 10 so the unused taps contribute nothing. */
   for (i=0;i<=10;i++)
   {
      num[i] = (i<=ord) ? _num[i] : 0;
      den[i] = (i<=ord) ? _den[i] : 0;
   }
   for (i=0;i<10;i++)
      mem[i] = (i<ord) ? _mem[i] : 0;

   for (i=0;i<N;i++)
      filter_mem2_step10(mem, x+i, y+i, num, den);

   for (i=0;i<ord;i++)
      _mem[i]=mem[i];
}

/* One step of the order-10 all-pole filter for a single sample (SSE).

   Computes  yi = *x + mem[0], stores it to *y, then updates the state:
      mem[j] = mem[j+1] - den[j+1]*yi   (j = 0..8)
      mem[9] =          - den[10]*yi
   den[0] is never used.

   Alignment contract: den+1 and mem+1 must be 16-byte aligned, because
   they are read with movaps. */
static __inline__ void iir_mem2_step10(float *mem, const float *x, float *y,
                                       const float *den)
{
   __asm__ __volatile__ (
      "movss  (%1), %%xmm0\n\t"          /* x[i]                        */
      "movss  (%0), %%xmm1\n\t"          /* mem[0]                      */
      "addss  %%xmm0, %%xmm1\n\t"        /* yi = x[i] + mem[0]          */
      "movss  %%xmm1, (%2)\n\t"          /* *y = yi                     */
      "shufps $0x00, %%xmm1, %%xmm1\n\t" /* broadcast yi to all lanes   */

      "movaps 4(%3),  %%xmm2\n\t"        /* den[1..4]                   */
      "movaps 20(%3), %%xmm3\n\t"        /* den[5..8]                   */
      "mulps  %%xmm1, %%xmm2\n\t"
      "mulps  %%xmm1, %%xmm3\n\t"
      "movss  36(%3), %%xmm4\n\t"        /* den[9]                      */
      "movss  40(%3), %%xmm5\n\t"        /* den[10]                     */
      "mulss  %%xmm1, %%xmm4\n\t"
      "mulss  %%xmm1, %%xmm5\n\t"
      "movaps 4(%0),  %%xmm6\n\t"        /* mem[1..4]                   */
      "subps  %%xmm2, %%xmm6\n\t"
      "movups %%xmm6, (%0)\n\t"          /* -> mem[0..3] (unaligned)    */
      "movaps 20(%0), %%xmm7\n\t"        /* mem[5..8]                   */
      "subps  %%xmm3, %%xmm7\n\t"
      "movups %%xmm7, 16(%0)\n\t"        /* -> mem[4..7] (unaligned)    */

      "movss  36(%0), %%xmm7\n\t"        /* mem[9]                      */
      "subss  %%xmm4, %%xmm7\n\t"
      "movss  %%xmm7, 32(%0)\n\t"        /* -> mem[8]                   */
      "xorps  %%xmm2, %%xmm2\n\t"        /* 0                           */
      "subss  %%xmm5, %%xmm2\n\t"        /* 0 - den[10]*yi              */
      "movss  %%xmm2, 36(%0)\n\t"        /* -> mem[9]                   */
      :
      : "r" (mem), "r" (x), "r" (y), "r" (den)
      : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6", "xmm7");
}

/* All-pole (IIR) filter with state kept in _mem (SSE version, gcc only).

   x     input samples (N of them); may be the same array as y
   _den  denominator coefficients, indices 1..ord are used
   y     output samples (N are written)
   N     number of samples
   ord   filter order
   _mem  filter state, ord values, updated in place

   The SSE kernel is hard-wired for order 10.  Orders below 10 are run
   through it by zero-padding the coefficients and the state; orders
   above 10 use the plain C recurrence. */
void iir_mem2(float *x, float *_den, float *y, int N, int ord, float *_mem)
{
   float den_buf[20], mem_buf[20];
   float *den, *mem;
   int i,j;

   if (ord > 10)
   {
      /* The scratch buffers and the kernel only cover order 10. */
      for (i=0;i<N;i++)
      {
         y[i] = x[i] + _mem[0];
         for (j=0;j<ord-1;j++)
         {
            _mem[j] = _mem[j+1] - _den[j+1]*y[i];
         }
         _mem[ord-1] = - _den[ord]*y[i];
      }
      return;
   }

   /* Place each working array so that element [1] sits on a 16-byte
      boundary.  Only the low four address bits are inspected, so this
      does not depend on the width of a pointer. */
   den = den_buf+4;
   den = (float*)((char*)den - ((unsigned long)den & 0xf)) - 1;
   mem = mem_buf+4;
   mem = (float*)((char*)mem - ((unsigned long)mem & 0xf)) - 1;

   /* Zero-pad up to order 10 so the unused taps contribute nothing. */
   for (i=0;i<=10;i++)
      den[i] = (i<=ord) ? _den[i] : 0;
   for (i=0;i<10;i++)
      mem[i] = (i<ord) ? _mem[i] : 0;

   for (i=0;i<N;i++)
      iir_mem2_step10(mem, x+i, y+i, den);

   for (i=0;i<ord;i++)
      _mem[i]=mem[i];
}

1.1                  speex/libspeex/ltp_sse.h

Index: ltp_sse.h
===================================================================
/* Copyright (C) 2002 Jean-Marc Valin 
   File: ltp_sse.h
   Long-Term Prediction functions (SSE version, gcc inline assembly)

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
   
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
   
   - Neither the name of the Xiph.org Foundation nor the names of its
   contributors may be used to endorse or promote products derived from
   this software without specific prior written permission.
   
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* Inner product of a[0..len-1] and b[0..len-1] (SSE version, gcc only).

   Whole groups of 20 elements are accumulated by the SSE loop, using
   unaligned loads so neither array needs any particular alignment.
   The remaining len%20 elements (or everything, when len < 20) are
   added by a scalar loop.  Returns 0 for len <= 0.

   The asm addresses its operands through compiler-chosen registers and
   declares every register it changes, so it does not touch the stack
   and assembles for both 32-bit and 64-bit x86. */
static float inner_prod(float *a, float *b, int len)
{
   float sum = 0;
   float *pa = a, *pb = b;
   int left = len - 20;   /* elements still pending after the current group */
   int done = 0;          /* elements consumed by the SSE loop */
   int i;

   if (len >= 20)
   {
      __asm__ __volatile__ (
         "xorps  %%xmm3, %%xmm3\n\t"        /* accumulator A               */
         "xorps  %%xmm4, %%xmm4\n"          /* accumulator B               */
         "1:\n\t"
         "movups (%0), %%xmm0\n\t"
         "movups (%1), %%xmm1\n\t"
         "mulps  %%xmm0, %%xmm1\n\t"

         "movups 16(%0), %%xmm5\n\t"
         "movups 16(%1), %%xmm6\n\t"
         "mulps  %%xmm5, %%xmm6\n\t"
         "addps  %%xmm1, %%xmm3\n\t"

         "movups 32(%0), %%xmm0\n\t"
         "movups 32(%1), %%xmm1\n\t"
         "mulps  %%xmm0, %%xmm1\n\t"
         "addps  %%xmm6, %%xmm4\n\t"

         "movups 48(%0), %%xmm5\n\t"
         "movups 48(%1), %%xmm6\n\t"
         "mulps  %%xmm5, %%xmm6\n\t"
         "addps  %%xmm1, %%xmm3\n\t"

         "movups 64(%0), %%xmm0\n\t"
         "movups 64(%1), %%xmm1\n\t"
         "mulps  %%xmm0, %%xmm1\n\t"
         "addps  %%xmm6, %%xmm4\n\t"
         "addps  %%xmm1, %%xmm3\n\t"

         "add    $80, %0\n\t"               /* advance 20 floats           */
         "add    $80, %1\n\t"
         "sub    $20, %2\n\t"
         "jae    1b\n\t"                    /* loop while no borrow        */

         "addps  %%xmm4, %%xmm3\n\t"        /* merge the two accumulators  */
         "movhlps %%xmm3, %%xmm4\n\t"       /* xmm4[0,1] = xmm3[2,3]       */
         "addps  %%xmm4, %%xmm3\n\t"        /* lanes 0,1 = s0+s2, s1+s3    */
         "movaps %%xmm3, %%xmm4\n\t"
         /* 0x55 replicates lane 1, which is the lane that still has to
            be added to lane 0 */
         "shufps $0x55, %%xmm4, %%xmm4\n\t"
         "addss  %%xmm4, %%xmm3\n\t"
         "movss  %%xmm3, %3\n\t"
         : "+r" (pa), "+r" (pb), "+r" (left), "=m" (sum)
         :
         : "memory", "cc", "xmm0", "xmm1", "xmm3", "xmm4", "xmm5", "xmm6");
      done = len - len%20;
   }

   /* Scalar tail: whatever the 20-wide loop did not cover. */
   for (i=done;i<len;i++)
      sum += a[i]*b[i];

   return sum;
}

--- >8 ----
List archives:  http://www.xiph.org/archives/
Ogg project homepage: http://www.xiph.org/ogg/
To unsubscribe from this list, send a message to 'cvs-request at xiph.org'
containing only the word 'unsubscribe' in the body.  No subject is needed.
Unsubscribe messages sent to the list will be ignored/filtered.