[xiph-commits] r8855 - trunk/speex/libspeex

jm at motherfish-iii.xiph.org
Mon Feb 7 00:46:59 PST 2005


Author: jm
Date: 2005-02-07 00:46:56 -0800 (Mon, 07 Feb 2005)
New Revision: 8855

Added:
   trunk/speex/libspeex/filters_arm4.h
   trunk/speex/libspeex/fixed_arm4.h
   trunk/speex/libspeex/ltp_arm4.h
Modified:
   trunk/speex/libspeex/Makefile.am
   trunk/speex/libspeex/arch.h
   trunk/speex/libspeex/filters.c
   trunk/speex/libspeex/ltp.c
Log:
ARM assembly version of pitch_xcorr; moved all ARM assembly to separate files
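
For context, the generic behaviour the new assembly reproduces is one inner
product per candidate pitch lag, as in the minimal sketch below (based on the
commented-out inner_prod call in ltp_arm4.h; not the committed code, and the
unused stack argument is omitted):

   static void pitch_xcorr_ref(const spx_word16_t *_x, const spx_word16_t *_y,
                               spx_word32_t *corr, int len, int nb_pitch)
   {
      int i;
      for (i=0;i<nb_pitch;i++)
         corr[nb_pitch-1-i] = inner_prod(_x, _y+i, len);
   }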


Modified: trunk/speex/libspeex/Makefile.am
===================================================================
--- trunk/speex/libspeex/Makefile.am	2005-02-07 07:59:33 UTC (rev 8854)
+++ trunk/speex/libspeex/Makefile.am	2005-02-07 08:46:56 UTC (rev 8855)
@@ -59,10 +59,13 @@
 	vbr.h \
 	misc.h \
 	ltp_sse.h \
+	ltp_arm4.h \
 	filters_sse.h \
+	filters_arm4.h \
 	math_approx.h \
 	smallft.h \
 	arch.h \
+	fixed_arm4.h \
 	fixed_arm5e.h \
 	fixed_debug.h \
 	fixed_generic.h \

Modified: trunk/speex/libspeex/arch.h
===================================================================
--- trunk/speex/libspeex/arch.h	2005-02-07 07:59:33 UTC (rev 8854)
+++ trunk/speex/libspeex/arch.h	2005-02-07 08:46:56 UTC (rev 8855)
@@ -66,6 +66,8 @@
 
 #ifdef ARM5E_ASM
 #include "fixed_arm5e.h"
+#elif defined (ARM4_ASM)
+#include "fixed_arm4.h"
 #elif defined (FIXED_DEBUG)
 #include "fixed_debug.h"
 #else
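
The fixed-point backend is chosen purely at compile time by this chain. A
hypothetical translation unit selecting the new ARM4 path (assumptions:
ARM4_ASM is a configure-time define, ARM5E_ASM is left undefined so the
#elif is reached, and FIXED_POINT enables the fixed-point build):

   #define FIXED_POINT   /* assumed: fixed-point build switch */
   #define ARM4_ASM      /* selects fixed_arm4.h via the #elif above */
   #include "arch.h"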

Modified: trunk/speex/libspeex/filters.c
===================================================================
--- trunk/speex/libspeex/filters.c	2005-02-07 07:59:33 UTC (rev 8854)
+++ trunk/speex/libspeex/filters.c	2005-02-07 08:46:56 UTC (rev 8855)
@@ -118,34 +118,8 @@
 
 #ifdef FIXED_POINT
 
-int normalize16(const spx_sig_t *x, spx_word16_t *y, int max_scale, int len)
-{
-   int i;
-   spx_sig_t max_val=1;
-   int sig_shift;
-   
-   for (i=0;i<len;i++)
-   {
-      spx_sig_t tmp = x[i];
-      if (tmp<0)
-         tmp = -tmp;
-      if (tmp >= max_val)
-         max_val = tmp;
-   }
 
-   sig_shift=0;
-   while (max_val>max_scale)
-   {
-      sig_shift++;
-      max_val >>= 1;
-   }
 
-   for (i=0;i<len;i++)
-      y[i] = SHR(x[i], sig_shift);
-   
-   return sig_shift;
-}
-
 spx_word16_t compute_rms(const spx_sig_t *x, int len)
 {
    int i;
@@ -188,95 +162,38 @@
 }
 
 #if defined(ARM4_ASM) || defined(ARM5E_ASM)
-void filter_mem2(const spx_sig_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_sig_t xi,yi,nyi;
+#include "filters_arm4.h"
+#else
 
-   for (i=0;i<N;i++)
-   {
-      int deadm, deadn, deadd, deadidx, x1, y1, dead1, dead2, dead3, dead4, dead5, dead6;
-      xi=SATURATE(x[i],805306368);
-      yi = SATURATE(ADD32(xi, SHL(mem[0],2)),805306368);
-      nyi = -yi;
-      y[i] = yi;
-      __asm__ __volatile__ (
-            "\tldrsh %6, [%1], #2\n"
-            "\tsmull %8, %9, %4, %6\n"
-            ".filterloop: \n"
-            "\tldrsh %6, [%2], #2\n"
-            "\tldr %10, [%0, #4]\n"
-            "\tmov %8, %8, lsr #15\n"
-            "\tsmull %7, %11, %5, %6\n"
-            "\tadd %8, %8, %9, lsl #17\n"
-            "\tldrsh %6, [%1], #2\n"
-            "\tadd %10, %10, %8\n"
-            "\tsmull %8, %9, %4, %6\n"
-            "\tadd %10, %10, %7, lsr #15\n"
-            "\tsubs %3, %3, #1\n"
-            "\tadd %10, %10, %11, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-            "\t bne .filterloop\n"
 
-            "\tmov %8, %8, lsr #15\n"
-            "\tadd %10, %8, %9, lsl #17\n"
-            "\tldrsh %6, [%2], #2\n"
-            "\tsmull %8, %9, %5, %6\n"
-            "\tadd %10, %10, %8, lsr #15\n"
-            "\tadd %10, %10, %9, lsl #17\n"
-            "\tstr %10, [%0], #4 \n"
-
-         : "=r" (deadm), "=r" (deadn), "=r" (deadd), "=r" (deadidx),
-           "=r" (xi), "=r" (nyi), "=r" (dead1), "=r" (dead2),
-           "=r" (dead3), "=r" (dead4), "=r" (dead5), "=r" (dead6)
-         : "0" (mem), "1" (num+1), "2" (den+1), "3" (ord-1), "4" (xi), "5" (nyi)
-         : "cc", "memory");
+int normalize16(const spx_sig_t *x, spx_word16_t *y, int max_scale, int len)
+{
+   int i;
+   spx_sig_t max_val=1;
+   int sig_shift;
    
+   for (i=0;i<len;i++)
+   {
+      spx_sig_t tmp = x[i];
+      if (tmp<0)
+         tmp = -tmp;
+      if (tmp >= max_val)
+         max_val = tmp;
    }
-}
 
-void iir_mem2(const spx_sig_t *x, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
-{
-   int i,j;
-   spx_sig_t xi,yi,nyi;
-
-   for (i=0;i<N;i++)
+   sig_shift=0;
+   while (max_val>max_scale)
    {
-      int deadm, deadd, deadidx, dead1, dead2, dead3, dead4, dead5, dead6;
-      xi=SATURATE(x[i],805306368);
-      yi = SATURATE(ADD32(xi, SHL(mem[0],2)),805306368);
-      nyi = -yi;
-      y[i] = yi;
-      __asm__ __volatile__ (
-            "\tldrsh %4, [%1], #2\n"
-            "\tsmull %5, %6, %3, %4\n"
+      sig_shift++;
+      max_val >>= 1;
+   }
 
-            ".iirloop: \n"
-            "\tldr %7, [%0, #4]\n"
-
-            "\tldrsh %4, [%1], #2\n"
-            "\tmov %5, %5, lsr #15\n"
-            "\tadd %8, %5, %6, lsl #17\n"
-            "\tsmull %5, %6, %3, %4\n"
-            "\tadd %7, %7, %8\n"
-            "\tstr %7, [%0], #4 \n"
-            "\tsubs %2, %2, #1\n"
-            "\t bne .iirloop\n"
-
-            "\tmov %5, %5, lsr #15\n"
-            "\tadd %7, %5, %6, lsl #17\n"
-            "\tstr %7, [%0], #4 \n"
-
-         : "=r" (deadm), "=r" (deadd), "=r" (deadidx), "=r" (nyi),
-           "=r" (dead1), "=r" (dead2), "=r" (dead3), "=r" (dead4),
-           "=r" (dead5), "=r" (dead6)
-         : "0" (mem), "1" (den+1), "2" (ord-1), "3" (nyi)
-         : "cc", "memory");
+   for (i=0;i<len;i++)
+      y[i] = SHR(x[i], sig_shift);
    
-   }
+   return sig_shift;
 }
 
-#else
 void filter_mem2(const spx_sig_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
 {
    int i,j;

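normalize16 returns the shift it applied so callers can account for the
scaling afterwards; a hypothetical call site (sig, buf16, FRAME_SIZE and the
16383 bound are illustrative, not from this commit):

   spx_word16_t buf16[FRAME_SIZE];
   int shift = normalize16(sig, buf16, 16383, FRAME_SIZE);
   /* buf16 now holds sig >> shift with |buf16[i]| <= 16383; downstream
      results carry 'shift' bits of extra scaling */
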
Added: trunk/speex/libspeex/filters_arm4.h
===================================================================
--- trunk/speex/libspeex/filters_arm4.h	2005-02-07 07:59:33 UTC (rev 8854)
+++ trunk/speex/libspeex/filters_arm4.h	2005-02-07 08:46:56 UTC (rev 8855)
@@ -0,0 +1,182 @@
+/* Copyright (C) 2004 Jean-Marc Valin 
+   File: filters_arm4.h
+   ARM4-optimized filtering routines
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   
+   - Neither the name of the Xiph.org Foundation nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+int normalize16(const spx_sig_t *x, spx_word16_t *y, int max_scale, int len)
+{
+   int i;
+   spx_sig_t max_val=1;
+   int sig_shift;
+   int dead1, dead2, dead3, dead4, dead5, dead6;
+
+   __asm__ __volatile__ (
+         "\tmov %1, #1 \n"
+         "\tmov %3, #0 \n"
+
+         ".normalize16loop1: \n"
+
+         "\tldr %4, [%0], #4 \n"
+         "\tcmps %4, %1 \n"
+         "\tmovgt %1, %4 \n"
+         "\tcmps %4, %3 \n"
+         "\tmovlt %3, %4 \n"
+
+         "\tsubs %2, %2, #1 \n"
+         "\tbne .normalize16loop1\n"
+
+         "\trsb %3, %3, #0 \n"
+         "\tcmp %1, %3 \n"
+         "\tmovlt %1, %3 \n"
+   : "=r" (dead1), "=r" (max_val), "=r" (dead3), "=r" (dead4),
+   "=r" (dead5), "=r" (dead6)
+   : "0" (x), "2" (len)
+   : "cc", "memory");
+
+   sig_shift=0;
+   while (max_val>max_scale)
+   {
+      sig_shift++;
+      max_val >>= 1;
+   }
+   
+   __asm__ __volatile__ (
+         ".normalize16loop: \n"
+
+         "\tldr %4, [%0], #4 \n"
+         "\tldr %5, [%0], #4 \n"
+         "\tmov %4, %4, asr %3 \n"
+         "\tstrh %4, [%1], #2 \n"
+         "\tldr %4, [%0], #4 \n"
+         "\tmov %5, %5, asr %3 \n"
+         "\tstrh %5, [%1], #2 \n"
+         "\tldr %5, [%0], #4 \n"
+         "\tmov %4, %4, asr %3 \n"
+         "\tstrh %4, [%1], #2 \n"
+         "\tsubs %2, %2, #1 \n"
+         "\tmov %5, %5, asr %3 \n"
+         "\tstrh %5, [%1], #2 \n"
+
+         "\tbge .normalize16loop\n"
+   : "=r" (dead1), "=r" (dead2), "=r" (dead3), "=r" (dead4),
+   "=r" (dead5), "=r" (dead6)
+   : "0" (x), "1" (y), "2" (len>>2), "3" (sig_shift)
+   : "cc", "memory");
+   return sig_shift;
+}
+
+
+void filter_mem2(const spx_sig_t *x, const spx_coef_t *num, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
+{
+   int i,j;
+   spx_sig_t xi,yi,nyi;
+
+   for (i=0;i<N;i++)
+   {
+      int deadm, deadn, deadd, deadidx, x1, y1, dead1, dead2, dead3, dead4, dead5, dead6;
+      xi=SATURATE(x[i],805306368);
+      yi = SATURATE(ADD32(xi, SHL(mem[0],2)),805306368);
+      nyi = -yi;
+      y[i] = yi;
+      __asm__ __volatile__ (
+            "\tldrsh %6, [%1], #2\n"
+            "\tsmull %8, %9, %4, %6\n"
+            ".filterloop: \n"
+            "\tldrsh %6, [%2], #2\n"
+            "\tldr %10, [%0, #4]\n"
+            "\tmov %8, %8, lsr #15\n"
+            "\tsmull %7, %11, %5, %6\n"
+            "\tadd %8, %8, %9, lsl #17\n"
+            "\tldrsh %6, [%1], #2\n"
+            "\tadd %10, %10, %8\n"
+            "\tsmull %8, %9, %4, %6\n"
+            "\tadd %10, %10, %7, lsr #15\n"
+            "\tsubs %3, %3, #1\n"
+            "\tadd %10, %10, %11, lsl #17\n"
+            "\tstr %10, [%0], #4 \n"
+            "\t bne .filterloop\n"
+
+            "\tmov %8, %8, lsr #15\n"
+            "\tadd %10, %8, %9, lsl #17\n"
+            "\tldrsh %6, [%2], #2\n"
+            "\tsmull %8, %9, %5, %6\n"
+            "\tadd %10, %10, %8, lsr #15\n"
+            "\tadd %10, %10, %9, lsl #17\n"
+            "\tstr %10, [%0], #4 \n"
+
+         : "=r" (deadm), "=r" (deadn), "=r" (deadd), "=r" (deadidx),
+      "=r" (xi), "=r" (nyi), "=r" (dead1), "=r" (dead2),
+      "=r" (dead3), "=r" (dead4), "=r" (dead5), "=r" (dead6)
+         : "0" (mem), "1" (num+1), "2" (den+1), "3" (ord-1), "4" (xi), "5" (nyi)
+         : "cc", "memory");
+   
+   }
+}
+
+void iir_mem2(const spx_sig_t *x, const spx_coef_t *den, spx_sig_t *y, int N, int ord, spx_mem_t *mem)
+{
+   int i,j;
+   spx_sig_t xi,yi,nyi;
+
+   for (i=0;i<N;i++)
+   {
+      int deadm, deadd, deadidx, dead1, dead2, dead3, dead4, dead5, dead6;
+      xi=SATURATE(x[i],805306368);
+      yi = SATURATE(ADD32(xi, SHL(mem[0],2)),805306368);
+      nyi = -yi;
+      y[i] = yi;
+      __asm__ __volatile__ (
+            "\tldrsh %4, [%1], #2\n"
+            "\tsmull %5, %6, %3, %4\n"
+
+            ".iirloop: \n"
+            "\tldr %7, [%0, #4]\n"
+
+            "\tldrsh %4, [%1], #2\n"
+            "\tmov %5, %5, lsr #15\n"
+            "\tadd %8, %5, %6, lsl #17\n"
+            "\tsmull %5, %6, %3, %4\n"
+            "\tadd %7, %7, %8\n"
+            "\tstr %7, [%0], #4 \n"
+            "\tsubs %2, %2, #1\n"
+            "\t bne .iirloop\n"
+
+            "\tmov %5, %5, lsr #15\n"
+            "\tadd %7, %5, %6, lsl #17\n"
+            "\tstr %7, [%0], #4 \n"
+
+         : "=r" (deadm), "=r" (deadd), "=r" (deadidx), "=r" (nyi),
+      "=r" (dead1), "=r" (dead2), "=r" (dead3), "=r" (dead4),
+      "=r" (dead5), "=r" (dead6)
+         : "0" (mem), "1" (den+1), "2" (ord-1), "3" (nyi)
+         : "cc", "memory");
+   
+   }
+}

Added: trunk/speex/libspeex/fixed_arm4.h
===================================================================
--- trunk/speex/libspeex/fixed_arm4.h	2005-02-07 07:59:33 UTC (rev 8854)
+++ trunk/speex/libspeex/fixed_arm4.h	2005-02-07 08:46:56 UTC (rev 8855)
@@ -0,0 +1,173 @@
+/* Copyright (C) 2004 Jean-Marc Valin */
+/**
+   @file fixed_arm4.h
+   @brief ARM4 fixed-point operations
+*/
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   
+   - Neither the name of the Xiph.org Foundation nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef FIXED_ARM4_H
+#define FIXED_ARM4_H
+
+#define SHR(a,shift) ((a) >> (shift))
+#define SHL(a,shift) ((a) << (shift))
+
+#define SATURATE(x,a) ((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x))
+
+#define ADD16(a,b) ((short)((short)(a)+(short)(b)))
+#define SUB16(a,b) ((a)-(b))
+#define ADD32(a,b) ((a)+(b))
+#define SUB32(a,b) ((a)-(b))
+#define ADD64(a,b) ((a)+(b))
+
+#define PSHR(a,shift) (SHR((a)+(1<<((shift)-1)),shift))
+
+/* result fits in 16 bits */
+#define MULT16_16_16(a,b)     ((((short)(a))*((short)(b))))
+
+#define MULT16_16(a,b)     (((short)(a))*((short)(b)))
+
+
+
+
+#define MAC16_16(c,a,b)     (ADD32((c),MULT16_16((a),(b))))
+#define MULT16_32_Q12(a,b) ADD32(MULT16_16((a),SHR((b),12)), SHR(MULT16_16((a),((b)&0x00000fff)),12))
+#define MULT16_32_Q13(a,b) ADD32(MULT16_16((a),SHR((b),13)), SHR(MULT16_16((a),((b)&0x00001fff)),13))
+#define MULT16_32_Q14(a,b) ADD32(MULT16_16((a),SHR((b),14)), SHR(MULT16_16((a),((b)&0x00003fff)),14))
+
+#define MULT16_32_Q11(a,b) ADD32(MULT16_16((a),SHR((b),11)), SHR(MULT16_16((a),((b)&0x000007ff)),11))
+#define MAC16_32_Q11(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),11)), SHR(MULT16_16((a),((b)&0x000007ff)),11)))
+
+#define MULT16_32_Q15(a,b) ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15))
+#define MAC16_32_Q15(c,a,b) ADD32(c,ADD32(MULT16_16((a),SHR((b),15)), SHR(MULT16_16((a),((b)&0x00007fff)),15)))
+
+
+#define MAC16_16_Q11(c,a,b)     (ADD32((c),SHR(MULT16_16((a),(b)),11)))
+
+#define MULT16_16_Q11(a,b) (SHR(MULT16_16((a),(b)),11))
+#define MULT16_16_Q13(a,b) (SHR(MULT16_16((a),(b)),13))
+#define MULT16_16_Q14(a,b) (SHR(MULT16_16((a),(b)),14))
+#define MULT16_16_Q15(a,b) (SHR(MULT16_16((a),(b)),15))
+
+#define MULT16_16_P13(a,b) (SHR(ADD32(4096,MULT16_16((a),(b))),13))
+#define MULT16_16_P14(a,b) (SHR(ADD32(8192,MULT16_16((a),(b))),14))
+#define MULT16_16_P15(a,b) (SHR(ADD32(16384,MULT16_16((a),(b))),15))
+
+#define MUL_16_32_R15(a,bh,bl) ADD32(MULT16_16((a),(bh)), SHR(MULT16_16((a),(bl)),15))
+
+
+
+//#define DIV32_16(a,b) ((short)(((signed int)(a))/((short)(b))))
+static inline short DIV3216(int a, int b)
+{
+   int res=0;
+   int dead1, dead2, dead3, dead4, dead5;
+   __asm__ __volatile__ (
+         "\teor %5, %0, %1\n"
+         "\tmovs %4, %0\n"
+         "\trsbmi %0, %0, #0 \n"
+         "\tmovs %4, %1\n"
+         "\trsbmi %1, %1, #0 \n"
+         "\tmov %4, #1\n"
+
+         "\tsubs %3, %0, %1, asl #14 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #14 \n"
+
+         "\tsubs %3, %0, %1, asl #13 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #13 \n"
+
+         "\tsubs %3, %0, %1, asl #12 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #12 \n"
+
+         "\tsubs %3, %0, %1, asl #11 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #11 \n"
+
+         "\tsubs %3, %0, %1, asl #10 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #10 \n"
+
+         "\tsubs %3, %0, %1, asl #9 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #9 \n"
+
+         "\tsubs %3, %0, %1, asl #8 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #8 \n"
+
+         "\tsubs %3, %0, %1, asl #7 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #7 \n"
+
+         "\tsubs %3, %0, %1, asl #6 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #6 \n"
+         
+         "\tsubs %3, %0, %1, asl #5 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #5 \n"
+
+         "\tsubs %3, %0, %1, asl #4 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #4 \n"
+
+         "\tsubs %3, %0, %1, asl #3 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #3 \n"
+
+         "\tsubs %3, %0, %1, asl #2 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #2 \n"
+
+         "\tsubs %3, %0, %1, asl #1 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4, asl #1 \n"
+
+         "\tsubs %3, %0, %1 \n"
+         "\tmovpl %0, %3 \n"
+         "\torrpl %2, %2, %4 \n"
+
+         "\tmovs %5, %5, lsr #31 \n"
+         "\trsbne %2, %2, #0 \n"
+   : "=r" (dead1), "=r" (dead2), "=r" (res),
+   "=r" (dead3), "=r" (dead4), "=r" (dead5)
+   : "0" (a), "1" (b), "2" (res)
+   : "cc"
+                        );
+   return res;
+}
+
+/* map the generic DIV32_16 name onto the assembly routine above */
+#define DIV32_16(a,b) DIV3216((a),(b))
+
+#define DIV32(a,b) (((signed int)(a))/((signed int)(b)))
+
+
+
+#endif
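
The fractional multiplies above split the 32-bit operand into a high part
and a 15-bit low part so that only 16x16 products are needed. A
self-contained sanity check (macros restated from the header; illustrative
only, not part of the commit):

   #include <stdio.h>

   #define SHR(a,shift) ((a) >> (shift))
   #define ADD32(a,b) ((a)+(b))
   #define MULT16_16(a,b) (((short)(a))*((short)(b)))
   #define MULT16_32_Q15(a,b) ADD32(MULT16_16((a),SHR((b),15)), \
                              SHR(MULT16_16((a),((b)&0x00007fff)),15))

   int main(void)
   {
      /* 0.5 (Q15) times 1.5 (32-bit Q15): expect 0.75, i.e. 24576 */
      printf("%d\n", MULT16_32_Q15(16384, 49152));
      return 0;
   }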

Modified: trunk/speex/libspeex/ltp.c
===================================================================
--- trunk/speex/libspeex/ltp.c	2005-02-07 07:59:33 UTC (rev 8854)
+++ trunk/speex/libspeex/ltp.c	2005-02-07 08:46:56 UTC (rev 8855)
@@ -45,63 +45,12 @@
 
 #ifdef _USE_SSE
 #include "ltp_sse.h"
+#elif defined (ARM4_ASM) || defined(ARM5E_ASM)
+#include "ltp_arm4.h"
 #else
 
-#if defined(ARM4_ASM) || defined(ARM5E_ASM)
 static spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
 {
-   spx_word32_t sum1=0,sum2=0;
-   spx_word16_t *deadx, *deady;
-   int deadlen, dead1, dead2, dead3, dead4, dead5, dead6;
-   __asm__ __volatile__ (
-         "\tldrsh %5, [%0], #2 \n"
-         "\tldrsh %6, [%1], #2 \n"
-         ".inner_prod_loop:\n"
-         "\tsub %7, %7, %7\n"
-         "\tsub %10, %10, %10\n"
-
-         "\tldrsh %8, [%0], #2 \n"
-         "\tldrsh %9, [%1], #2 \n"
-         "\tmla %7, %5, %6, %7\n"
-         "\tldrsh %5, [%0], #2 \n"
-         "\tldrsh %6, [%1], #2 \n"
-         "\tmla %10, %8, %9, %10\n"
-         "\tldrsh %8, [%0], #2 \n"
-         "\tldrsh %9, [%1], #2 \n"
-         "\tmla %7, %5, %6, %7\n"
-         "\tldrsh %5, [%0], #2 \n"
-         "\tldrsh %6, [%1], #2 \n"
-         "\tmla %10, %8, %9, %10\n"
-
-         "\tldrsh %8, [%0], #2 \n"
-         "\tldrsh %9, [%1], #2 \n"
-         "\tmla %7, %5, %6, %7\n"
-         "\tldrsh %5, [%0], #2 \n"
-         "\tldrsh %6, [%1], #2 \n"
-         "\tmla %10, %8, %9, %10\n"
-         "\tldrsh %8, [%0], #2 \n"
-         "\tldrsh %9, [%1], #2 \n"
-         "\tmla %7, %5, %6, %7\n"
-         "\tldrsh %5, [%0], #2 \n"
-         "\tldrsh %6, [%1], #2 \n"
-         "\tmla %10, %8, %9, %10\n"
-
-         "\tsubs %4, %4, #1\n"
-         "\tadd %2, %2, %7, asr #5\n"
-         "\tadd %3, %3, %10, asr #5\n"
-         "\tbne .inner_prod_loop\n"
-   : "=r" (deadx), "=r" (deady), "=r" (sum1),  "=r" (sum2), "=r" (deadlen),
-   "=r" (dead1), "=r" (dead2), "=r" (dead3), "=r" (dead4), "=r" (dead5), "=r"
-   : "0" (x), "1" (y), "2" (sum1), "3" (sum2), "4" (len>>3)
-   : "cc", "memory"
-                        );
-   return (sum1+sum2)>>1;
-}
-
-
-#else
-static spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
-{
    int i;
    spx_word32_t sum=0;
    for (i=0;i<len;i+=4)
@@ -115,9 +64,8 @@
    }
    return sum;
 }
-#endif
 
-#if 0
+#if 0 /* Enable this for machines with enough registers (i.e. not x86) */
 static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
 {
    int i,j;
@@ -181,8 +129,7 @@
 
 }
 #else
-static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word
-32_t *corr, int len, int nb_pitch, char *stack)
+static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
 {
    int i;
    for (i=0;i<nb_pitch;i++)

Added: trunk/speex/libspeex/ltp_arm4.h
===================================================================
--- trunk/speex/libspeex/ltp_arm4.h	2005-02-07 07:59:33 UTC (rev 8854)
+++ trunk/speex/libspeex/ltp_arm4.h	2005-02-07 08:46:56 UTC (rev 8855)
@@ -0,0 +1,164 @@
+/* Copyright (C) 2004 Jean-Marc Valin 
+   File: ltp_arm4.h
+   Long-Term Prediction functions (ARM4 version)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+   
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+   
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+   
+   - Neither the name of the Xiph.org Foundation nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+   
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+static spx_word32_t inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
+{
+   spx_word32_t sum1=0,sum2=0;
+   spx_word16_t *deadx, *deady;
+   int deadlen, dead1, dead2, dead3, dead4, dead5, dead6;
+   __asm__ __volatile__ (
+         "\tldrsh %5, [%0], #2 \n"
+         "\tldrsh %6, [%1], #2 \n"
+         ".inner_prod_loop:\n"
+         "\tsub %7, %7, %7\n"
+         "\tsub %10, %10, %10\n"
+
+         "\tldrsh %8, [%0], #2 \n"
+         "\tldrsh %9, [%1], #2 \n"
+         "\tmla %7, %5, %6, %7\n"
+         "\tldrsh %5, [%0], #2 \n"
+         "\tldrsh %6, [%1], #2 \n"
+         "\tmla %10, %8, %9, %10\n"
+         "\tldrsh %8, [%0], #2 \n"
+         "\tldrsh %9, [%1], #2 \n"
+         "\tmla %7, %5, %6, %7\n"
+         "\tldrsh %5, [%0], #2 \n"
+         "\tldrsh %6, [%1], #2 \n"
+         "\tmla %10, %8, %9, %10\n"
+
+         "\tldrsh %8, [%0], #2 \n"
+         "\tldrsh %9, [%1], #2 \n"
+         "\tmla %7, %5, %6, %7\n"
+         "\tldrsh %5, [%0], #2 \n"
+         "\tldrsh %6, [%1], #2 \n"
+         "\tmla %10, %8, %9, %10\n"
+         "\tldrsh %8, [%0], #2 \n"
+         "\tldrsh %9, [%1], #2 \n"
+         "\tmla %7, %5, %6, %7\n"
+         "\tldrsh %5, [%0], #2 \n"
+         "\tldrsh %6, [%1], #2 \n"
+         "\tmla %10, %8, %9, %10\n"
+
+         "\tsubs %4, %4, #1\n"
+         "\tadd %2, %2, %7, asr #5\n"
+         "\tadd %3, %3, %10, asr #5\n"
+         "\tbne .inner_prod_loop\n"
+   : "=r" (deadx), "=r" (deady), "=r" (sum1),  "=r" (sum2), "=r" (deadlen),
+   "=r" (dead1), "=r" (dead2), "=r" (dead3), "=r" (dead4), "=r" (dead5), "=r"
+   : "0" (x), "1" (y), "2" (sum1), "3" (sum2), "4" (len>>3)
+   : "cc", "memory"
+                        );
+   return (sum1+sum2)>>1;
+}
+         
+static void pitch_xcorr(const spx_word16_t *_x, const spx_word16_t *_y, spx_word32_t *corr, int len, int nb_pitch, char *stack)
+{
+   int i,j;
+   for (i=0;i<nb_pitch;i+=4)
+   {
+      /* Compute correlation*/
+      //corr[nb_pitch-1-i]=inner_prod(x, _y+i, len);
+      spx_word32_t sum1=0;
+      spx_word32_t sum2=0;
+      spx_word32_t sum3=0;
+      spx_word32_t sum4=0;
+      const spx_word16_t *y = _y+i;
+      const spx_word16_t *x = _x;
+      spx_word32_t y0, y1, y2, y3;
+      y0=*y++;
+      y1=*y++;
+      y2=*y++;
+      y3=*y++;
+      for (j=0;j<len;j+=4)
+      {
+         spx_word32_t part1, part2, part3, part4, x0;
+         spx_word16_t z0,z1,z2,z3;
+         __asm__ __volatile__ (
+               "\tldrsh %10, [%8], #2 \n"
+               "\tmul %4, %10, %0 \n"
+               "\tmul %5, %10, %1 \n"
+               "\tmul %6, %10, %2 \n"
+               "\tmul %7, %10, %3 \n"
+
+               "\tldrsh %10, [%8], #2 \n"
+               "\tldrsh %0, [%9], #2 \n"
+               "\tmla %4, %10, %1, %4 \n"
+               "\tmla %5, %10, %2, %5 \n"
+               "\tmla %6, %10, %3, %6 \n"
+               "\tmla %7, %10, %0, %7 \n"
+
+               "\tldrsh %10, [%8], #2 \n"
+               "\tldrsh %1, [%9], #2 \n"
+               "\tmla %4, %10, %2, %4 \n"
+               "\tmla %5, %10, %3, %5 \n"
+               "\tmla %6, %10, %0, %6 \n"
+               "\tmla %7, %10, %1, %7 \n"
+
+               "\tldrsh %10, [%8], #2 \n"
+               "\tldrsh %2, [%9], #2 \n"
+               "\tmla %4, %10, %3, %4 \n"
+               "\tmla %5, %10, %0, %5 \n"
+               "\tmla %6, %10, %1, %6 \n"
+               "\tmla %7, %10, %2, %7 \n"
+
+               "\tldrsh %3, [%9], #2 \n"
+
+
+               "\tldr %10, %11 \n"
+               "\tadd %4, %10, %4, asr #6 \n"
+               "\tldr %10, %12 \n"
+               "\tstr %4, %11 \n"
+               "\tadd %5, %10, %5, asr #6 \n"
+               "\tldr %10, %13 \n"
+               "\tstr %5, %12 \n"
+               "\tadd %6, %10, %6, asr #6 \n"
+               "\tldr %10, %14 \n"
+               "\tstr %6, %13 \n"
+               "\tadd %7, %10, %7, asr #6 \n"
+               "\tstr %7, %14 \n"
+
+            : "=r" (y0), "=r" (y1), "=r" (y2), "=r" (y3),
+         "=r" (part1),  "=r" (part2),  "=r" (part3),  "=r" (part4),
+         "=r" (x), "=r" (y), "=r" (x0),
+         "=m" (sum1), "=m" (sum2), "=m" (sum3), "=m" (sum4)
+            : "0" (y0), "1" (y1), "2" (y2), "3" (y3),
+            "8" (x), "9" (y),
+            "11" (sum1), "12" (sum2), "13" (sum3), "14" (sum4)
+            : "cc", "memory"
+                              );
+      }
+      corr[nb_pitch-1-i]=sum1;
+      corr[nb_pitch-2-i]=sum2;
+      corr[nb_pitch-3-i]=sum3;
+      corr[nb_pitch-4-i]=sum4;
+   }
+
+}
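
For readers checking the fixed-point scaling in inner_prod: modulo the
software pipelining, the assembly accumulates as in the plain-C sketch below
(an approximation for illustration; it assumes len is a multiple of 8,
matching the len>>3 loop count):

   static spx_word32_t inner_prod_ref(const spx_word16_t *x,
                                      const spx_word16_t *y, int len)
   {
      int i, j;
      spx_word32_t sum1=0, sum2=0;
      for (i=0;i<len;i+=8)
      {
         spx_word32_t part1=0, part2=0;
         for (j=0;j<8;j+=2)
         {
            part1 += x[i+j]*y[i+j];       /* even taps */
            part2 += x[i+j+1]*y[i+j+1];   /* odd taps */
         }
         sum1 += part1 >> 5;   /* scale partials to avoid overflow */
         sum2 += part2 >> 5;
      }
      return (sum1+sum2) >> 1;   /* net: products scaled down by 2^6 */
   }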


