[opus] [PATCH] 02-Add CELT filter optimizations

Tue May 21 05:07:41 PDT 2013

Please ignore my previous mail and patch, there is a new version :).

Patch changes are:
- Use MAC16_16 macros instead of (sum += a*b) and unroll a loop by 2. This
increases performance when using optimized macros (e.g. ARMv5E). A
possible side effect of the loop unrolling is that I don't check for odd
lengths here.
- Add NEON version of FIR filter and autocorr
- Add a section in autoconf in order to check NEON support

Best regards,
-- 
Aurélien Zanelli
Parrot SA
174, quai de Jemmapes
75010 Paris
France
-------------- next part --------------

diff --git a/celt/celt_lpc.c b/celt/celt_lpc.c
index d2addbf..14a7839 100644
--- a/celt/celt_lpc.c
+++ b/celt/celt_lpc.c
@@ -33,6 +33,10 @@
 #include "stack_alloc.h"
 #include "mathops.h"
 
+#ifdef ARM_HAVE_NEON
+#include "celt_lpc_neon.h"
+#endif
+
 void _celt_lpc(
       opus_val16       *_lpc, /* out: [0...p-1] LPC coefficients      */
 const opus_val32 *ac,  /* in:  [0...p] autocorrelation values  */
@@ -87,6 +91,7 @@ int          p
 #endif
 }
 
+#ifndef OVERRIDE_CELT_FIR
 void celt_fir(const opus_val16 *x,
          const opus_val16 *num,
          opus_val16 *y,
@@ -101,7 +106,7 @@ void celt_fir(const opus_val16 *x,
       opus_val32 sum = SHL32(EXTEND32(x[i]), SIG_SHIFT);
       for (j=0;j<ord;j++)
       {
-         sum += MULT16_16(num[j],mem[j]);
+         sum = MAC16_16(sum, num[j], mem[j]);
       }
       for (j=ord-1;j>=1;j--)
       {
@@ -111,6 +116,7 @@ void celt_fir(const opus_val16 *x,
       y[i] = ROUND16(sum, SIG_SHIFT);
    }
 }
+#endif
 
 void celt_iir(const opus_val32 *x,
          const opus_val16 *den,
@@ -136,6 +142,7 @@ void celt_iir(const opus_val32 *x,
    }
 }
 
+#ifndef OVERRIDE_CELT_AUTOCORR
 void _celt_autocorr(
                    const opus_val16 *x,   /*  in: [0...n-1] samples x   */
                    opus_val32       *ac,  /* out: [0...lag-1] ac values */
@@ -163,8 +170,12 @@ void _celt_autocorr(
    {
       opus_val32 ac0=0;
       int shift;
-      for(i=0;i<n;i++)
+      int n2 = n>>1;
+      for(i=0;i<n2;i++)
+      {
          ac0 += SHR32(MULT16_16(xx[i],xx[i]),9);
+         ac0 += SHR32(MULT16_16(xx[n2+i],xx[n2+i]),9);
+      }
       ac0 += 1+n;
 
       shift = celt_ilog2(ac0)-30+10;
@@ -176,7 +187,7 @@ void _celt_autocorr(
    while (lag>=0)
    {
       for (i = lag, d = 0; i < n; i++)
-         d += xx[i] * xx[i-lag];
+         d = MAC16_16(d, xx[i], xx[i-lag]);
       ac[lag] = d;
       /*printf ("%f ", ac[lag]);*/
       lag--;
@@ -186,3 +197,4 @@ void _celt_autocorr(
 
    RESTORE_STACK;
 }
+#endif
diff --git a/celt/celt_lpc_neon.h b/celt/celt_lpc_neon.h
new file mode 100644
index 0000000..e9f76c6
--- /dev/null
+++ b/celt/celt_lpc_neon.h
@@ -0,0 +1,485 @@
+/* Copyright (c) 2013 Parrot */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef CELT_LPC_NEON_H
+#define CELT_LPC_NEON_H
+
+#ifdef FIXED_POINT
+
+#ifndef DISABLE_CELT_FIR_NEON
+#define OVERRIDE_CELT_FIR
+/* Optimized FIR filters for orders 1 and 4, which are the ones used by the
+ * Opus encoder: the FIR calls in pitch.c are hard-coded with orders 1 and 4.
+ *
+ * TODO: Test the sample-by-sample filtering path
+ */
+
+/* Order 1 NEON FIR: y[i] = ((x[i] << SIG_SHIFT) + num*x[i-1]) >> SIG_SHIFT (rounded), with x[-1] = mem; mem is by value, so no state is returned */
+static void celt_fir1(const opus_val16 *x, opus_val16 num, opus_val16 *y,
+    int N, opus_val16 mem)
+{
+  int i;
+
+  __asm__ __volatile__(
+      "vdup.s16 d8, %1;\n" //Duplicate num in every d8 lane
+      "vdup.s16 q5, %4;\n" //Duplicate mem in every q5 lane
+
+      /* We try to process 16 samples at a time (%5 = N / 16) */
+      "movs %5, %3, lsr #4;\n"
+      "beq .celt_fir1_process16_done_%=;\n"
+
+      ".celt_fir1_process16_%=:\n"
+      /* Load 16 x values in q0, q1 */
+      "vld1.16 {q0-q1}, [%0]!;\n"
+
+      /* Init the sixteen 32-bit sums in q7, q8, q9, q10 with x << SIG_SHIFT */
+      "vshll.s16 q7, d0, %[SIGSHIFT];\n"
+      "vshll.s16 q8, d1, %[SIGSHIFT];\n"
+      "vshll.s16 q9, d2, %[SIGSHIFT];\n"
+      "vshll.s16 q10, d3, %[SIGSHIFT];\n"
+
+      /* Build the "previous sample" vectors in q5, q6: x delayed by one */
+      "vext.16 q5, q5, q0, #7;\n"
+      "vext.16 q6, q0, q1, #7;\n"
+
+      /* Filter 16 samples at a time: sum += num * previous sample */
+      "vmlal.s16 q7, d8, d10;\n"
+      "vmlal.s16 q8, d8, d11;\n"
+      "vmlal.s16 q9, d8, d12;\n"
+      "vmlal.s16 q10, d8, d13;\n"
+
+      /* Reduce filter sums to 16 bits (rounding shift) for y output */
+      "vrshrn.s32 d4, q7, %[SIGSHIFT];\n"
+      "vrshrn.s32 d5, q8, %[SIGSHIFT];\n"
+      "vrshrn.s32 d6, q9, %[SIGSHIFT];\n"
+      "vrshrn.s32 d7, q10, %[SIGSHIFT];\n"
+
+      "pld [%0, #0];\n"
+
+      /* Duplicate the last x sample of this block into q5: it is the
+       * "previous sample" for the next block and for the tail loop */
+      "vdup.s16 q5, d3[3];\n" 
+
+      /* Store 16 y results */
+      "vst1.16 {q2-q3}, [%2]!;\n"
+
+      "subs %5, %5, #1;\n"
+      "bne .celt_fir1_process16_%=;\n"
+      ".celt_fir1_process16_done_%=:\n"
+
+      /* Check if some samples remain (%5 = N % 16) */
+      "ands %5, %3, #15;\n"
+      "beq .celt_fir1_done_%=;\n"
+
+      /* Process remaining samples one by one with NEON.
+       * The sample is loaded into lane 0 and the previous sample sits in
+       * lane 0 of d11, so lane 0 of the result is the one to store */
+      ".celt_fir1_process_remaining_%=:\n"
+      "vld1.16 d0[0], [%0]!;\n"           //Load x
+      "vshll.s16 q7, d0, %[SIGSHIFT];\n"  //Initialize sum
+      "vmlal.s16 q7, d8, d11;\n"          //Multiply-accumulate
+      "vrshrn.s32 d4, q7, %[SIGSHIFT];\n" //Scale result
+      "vmov.s16 d11, d0;\n"               //Current x becomes the previous sample
+      "vst1.16 d4[0], [%2]!;\n"           //Store result (same lane as the load)
+
+      "subs %5, %5, #1;\n"
+      "bne .celt_fir1_process_remaining_%=;\n"
+
+      ".celt_fir1_done_%=:\n"
+      : "=r"(x), "=r"(num), "=r"(y), "=r"(N), "=r"(mem), "=r"(i)
+      : "0"(x), "1"(num), "2"(y), "3"(N), "4"(mem), [SIGSHIFT]"I"(SIG_SHIFT)
+        /* Clobber d0-d21 explicitly because some gcc versions (4.4.3) don't
+         * alias q(x) to d(2x), d(2x+1) */
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", 
+      "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", 
+      "d19", "d20", "d21"
+      );
+}
+
+/* Order 4 NEON FIR: y[i] = round((x[i] << SIG_SHIFT) + sum_j num[j]*x[i-1-j]), j = 0..3, history seeded from mem[0..3] (mem[0] newest); mem is not written back */
+static void celt_fir4(const opus_val16 *x, const opus_val16 *num, opus_val16 *y,
+    int N, opus_val16 *mem)
+{
+  int i;
+  
+  __asm__ __volatile__(
+      "vld1.16 {d4}, [%1];\n" //Load num[0..3] in d4
+      "vld1.16 {d11}, [%4];\n" //Load mem[0..3] in d11
+      /* Keep the history oldest-first in d11 on every path, plus a reversed
+       * copy of the taps in d5 for the one-sample tail loop */
+      "vrev64.16 d11, d11;\n"
+      "vrev64.16 d5, d4;\n"
+
+      /* We try to process 16 samples at a time (%5 = N / 16) */
+      "movs %5, %3, lsr #4;\n"
+      "beq .celt_fir4_process16_done_%=;\n"
+      ".celt_fir4_process16_%=:\n"
+      /* Load 16 x values in q0, q1 */
+      "vld1.16 {q0-q1}, [%0]!;\n"
+
+      /* Init the sixteen 32-bit sums in q7, q8, q9, q10 with x << SIG_SHIFT */
+      "vshll.s16 q7, d0, %[SIGSHIFT];\n"
+      "vshll.s16 q8, d1, %[SIGSHIFT];\n"
+      "vshll.s16 q9, d2, %[SIGSHIFT];\n"
+      "vshll.s16 q10, d3, %[SIGSHIFT];\n"
+
+      /* Build the delayed sample vectors used by the filter:
+       * each output needs the four previous samples.
+       * q5, q6 start as x delayed by four (history, then x[0..11]) */
+      "vext.16 q5, q5, q0, #4;\n"
+      "vext.16 q6, q0, q1, #4;\n"
+
+      /* Filter 16 samples at a time, walking the taps in reverse order:
+       * start with the oldest sample (delay 4) and num[3] */
+      "vmlal.s16 q7, d10, d4[3];\n"
+      "vmlal.s16 q8, d11, d4[3];\n"
+      "vmlal.s16 q9, d12, d4[3];\n"
+      "vmlal.s16 q10, d13, d4[3];\n"
+      
+      /* Slide the window by one: samples delayed by three, for num[2] */
+      "vext.16 q5, q5, q6, #1;\n"
+      "vext.16 d12, d12, d13, #1\n"
+      "vext.16 d22, d13, d3, #1\n" //d22 is a scratch so d13 keeps x[8..11] for the next slides
+
+      "vmlal.s16 q7, d10, d4[2];\n"
+      "vmlal.s16 q8, d11, d4[2];\n"
+      "vmlal.s16 q9, d12, d4[2];\n"
+      "vmlal.s16 q10, d22, d4[2];\n"
+      
+      /* Slide the window by one: samples delayed by two, for num[1] */
+      "vext.16 q5, q5, q6, #1;\n"
+      "vext.16 d12, d12, d22, #1\n"
+      "vext.16 d22, d13, d3, #2\n"
+      
+      "vmlal.s16 q7, d10, d4[1];\n"
+      "vmlal.s16 q8, d11, d4[1];\n"
+      "vmlal.s16 q9, d12, d4[1];\n"
+      "vmlal.s16 q10, d22, d4[1];\n"
+      
+      /* Slide the window by one: samples delayed by one, for num[0] */
+      "vext.16 q5, q5, q6, #1;\n"
+      "vext.16 d12, d12, d22, #1\n"
+      "vext.16 d22, d13, d3, #3\n"
+      
+      "vmlal.s16 q7, d10, d4[0];\n"
+      "vmlal.s16 q8, d11, d4[0];\n"
+      "vmlal.s16 q9, d12, d4[0];\n"
+      "vmlal.s16 q10, d22, d4[0];\n"
+      
+      /* Reduce filter sums to 16 bits (rounding shift) for y output */
+      "vrshrn.s32 d6, q7, %[SIGSHIFT];\n"
+      "vrshrn.s32 d7, q8, %[SIGSHIFT];\n"
+      "vrshrn.s32 d8, q9, %[SIGSHIFT];\n"
+      "vrshrn.s32 d9, q10, %[SIGSHIFT];\n"
+
+      "pld [%0, #0];\n"
+
+      /* Copy the last four x samples (oldest-first) to d11: they are the
+       * history for the next block and for the tail loop */
+      "vmov.s16 d11, d3;\n" 
+      
+      /* Store 16 y results */
+      "vst1.16 {q3-q4}, [%2]!;\n"
+
+      "subs %5, %5, #1;\n"
+      "bne .celt_fir4_process16_%=;\n"
+      ".celt_fir4_process16_done_%=:\n"
+
+      /* Check if some samples remain (%5 = N % 16) */
+      "ands %5, %3, #15;\n"
+      "beq .celt_fir4_done_%=;\n"
+      
+      /* Process remaining samples one by one with NEON.
+       * d11 holds the last four samples oldest-first and d5 the taps in
+       * matching order, so the four products are summed across lanes */
+      ".celt_fir4_process_remaining_%=:\n"
+      "vld1.16 d0[0], [%0]!;\n"          //Load x
+      "vshll.s16 q7, d0, %[SIGSHIFT];\n" //Initialize sum (lane 0 of q7)
+      "vmull.s16 q8, d5, d11;\n"         //Multiply taps by history
+      "vadd.s32 d16, d16, d17;\n"        //Three next instructions reduce the sum
+      "vpadd.s32 d16, d16;\n"
+      "vadd.s32 d14, d14, d16;\n"
+      "vrshrn.s32 d6, q7, %[SIGSHIFT];\n" //Scale result to 16 bits
+      "vext.16 d11, d11, d0, #1;\n"       //Drop oldest sample, append x
+      "vst1.16 d6[0], [%2]!;\n"           //Store result
+
+      "subs %5, %5, #1;\n"
+      "bne .celt_fir4_process_remaining_%=;\n"
+
+      ".celt_fir4_done_%=:\n"
+      : "=r"(x), "=r"(num), "=r"(y), "=r"(N), "=r"(mem), "=r"(i)
+      : "0"(x), "1"(num), "2"(y), "3"(N), "4"(mem), [SIGSHIFT]"I"(SIG_SHIFT)
+        /* Clobber d0-d22 explicitly because some gcc versions (4.4.3) don't
+         * alias q(x) to d(2x), d(2x+1) */
+      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
+      "d9", "d10", "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", 
+      "d19", "d20", "d21", "d22"
+      );
+}
+
+void celt_fir(const opus_val16 *x, const opus_val16 *num, opus_val16 *y,
+    int N, int ord, opus_val16 *mem)
+{
+  /* Orders 1 and 4 go to the NEON kernels; any other order runs the scalar loop */
+  if (ord == 1)
+  {
+    celt_fir1(x, *num, y, N, *mem);
+  }
+  else if (ord == 4)
+  {
+    celt_fir4(x, num, y, N, mem);
+  }
+  else
+  {
+    int n, k;
+    for (n = 0; n < N; n++)
+    {
+      opus_val32 acc = SHL32(EXTEND32(x[n]), SIG_SHIFT);
+      /* Accumulate the taps against the delay line (mem[0] is the newest sample) */
+      for (k = 0; k < ord; k++)
+        acc = MAC16_16(acc, num[k], mem[k]);
+      /* Age the delay line by one slot and push the current input */
+      for (k = ord - 1; k >= 1; k--)
+        mem[k] = mem[k - 1];
+      mem[0] = x[n];
+      y[n] = ROUND16(acc, SIG_SHIFT);
+    }
+  }
+}
+#endif /* CELT_FIR_NEON */
+
+
+#ifndef DISABLE_CELT_AUTOCORR_NEON
+#define OVERRIDE_CELT_AUTOCORR
+void _celt_autocorr(   /* NEON fixed-point autocorrelation of the (optionally windowed) input */
+		const opus_val16 *x,   /*  in: [0...n-1] samples x   */
+		opus_val32       *ac,  /* out: [0...lag] ac values   */
+		const opus_val16       *window, /* in: [0...overlap-1], only read when overlap > 0 */
+		int          overlap,  /* number of samples windowed at each end of x */
+		int          lag,      /* highest lag computed; ac[lag] down to ac[0] are written */
+		int          n         /* number of samples in x */
+		)
+{
+	opus_val32 d;
+	int i;
+	VARDECL(opus_val16, xx);
+	SAVE_STACK;
+	ALLOC(xx, n, opus_val16);
+	celt_assert(n>0);
+	celt_assert(overlap>=0);
+	for (i=0;i<n;i++)
+		xx[i] = x[i];
+	/* Window both ends: xx[i] = (x[i]*window[i])>>15 and xx[n-1-i] = (x[n-1-i]*window[i])>>15 for i < overlap */
+	{
+		opus_val16 * xxbeg = xx;
+		opus_val16 * xxend = xx+n;     /* one past the end: pre-decremented before each store */
+		const opus_val16 * xbeg = x;
+		const opus_val16 * xend = x+n; /* one past the end: pre-decremented before each load */
+		int scratch0, scratch1, scratch2, scratch3, scratch4;	
+		__asm__ __volatile__(
+				"movs %6, %5, lsr #3;\n" //overlap/8
+				"beq .celt_autocorr_process8_done_%=;\n"
+
+				/* Process 8 samples at a time */
+				".celt_autocorr_process8_%=:\n"
+				"subs %3, %3, #16;\n"    //Step the end pointers back by 8 samples
+				"subs %1, %1, #16;\n"
+				"vld1.16 {q2}, [%4]!;\n" //Load 8 window values
+				"vld1.16 {q0}, [%2]!;\n" //Load 8 x values from beg
+				"vld1.16 {q1}, [%3];\n"  //Load 8 x values from end
+
+				/* MULT16_16_Q15(x[i],window[i]) */
+				"vmull.s16 q3, d0, d4;\n"
+				"vmull.s16 q4, d1, d5;\n"
+
+				"pld [%4, #0];\n"
+				"vrev64.16 q7, q2;\n"    //d14 = w[3..0], d15 = w[7..4]: window reversed for the end samples
+				/* MULT16_16_Q15(x[n-i-1],window[i]) */
+				"vmull.s16 q5, d2, d15;\n"
+				"vmull.s16 q6, d3, d14;\n"
+
+				"pld [%2, #0];\n"
+				
+				/* Shift right by 15 */
+				"vshrn.s32 d0, q3, #15;\n"
+				"vshrn.s32 d1, q4, #15;\n"
+				"vshrn.s32 d2, q5, #15;\n"
+				"vshrn.s32 d3, q6, #15;\n"
+
+				"pld [%3, #-16];\n"
+
+				"vst1.16 {q0}, [%0]!;\n"
+				"vst1.16 {q1}, [%1];\n"
+
+				"subs %6, %6, #1;\n"
+				"bne .celt_autocorr_process8_%=;\n"
+				".celt_autocorr_process8_done_%=:\n"
+
+				"ands %6, %5, #7;\n"     //overlap&7
+				"beq .celt_autocorr_done_%=;\n"
+
+				/* Process remaining samples one pair (beg, end) at a time */
+				".celt_autocorr_process_remaining_%=:\n"
+				"subs %3, %3, #2;\n"      //Step the end pointers back by 1 sample
+				"subs %1, %1, #2;\n"
+				"vld1.16 d4[0], [%4]!;\n" //Load 1 window value
+				"vld1.16 d0[0], [%2]!;\n" //Load 1 x value from beg
+				"vld1.16 d0[1], [%3];\n"  //Load 1 x value from end
+
+				"vmull.s16 q3, d0, d4[0];\n"
+				"vshrn.s32 d0, q3, #15;\n"
+
+				"vst1.16 d0[0], [%0]!;\n"
+				"vst1.16 d0[1], [%1];\n"
+
+				"subs %6, %6, #1;\n"
+				"bne .celt_autocorr_process_remaining_%=;\n"
+				".celt_autocorr_done_%=:\n"
+				: "=r"(scratch0), "=r"(scratch1), "=r"(scratch2), "=r"(scratch3),
+				"=r"(scratch4), "=r"(overlap), "=r"(i)
+				: "0"(xxbeg), "1"(xxend), "2"(xbeg), "3"(xend), "4"(window), "5"(overlap)
+        /* Clobber d0-d15 explicitly because some gcc versions (4.4.3) don't
+         * alias q(x) to d(2x), d(2x+1) */
+				: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
+				"d9", "d10", "d11", "d12", "d13", "d14", "d15"
+				);
+	}
+	/* Energy estimate: ac0 = sum((xx[i]*xx[i])>>9) + 1 + n, used to pick a common down-shift for xx */
+
+	{
+		opus_val32 ac0;
+		int shift;
+		int scratch1;
+		__asm__ __volatile__(
+				"veor.s32 q0, q0, q0;\n"  //Clear the four partial sums
+				"movs %3, %2, lsr #3;\n"  //n/8
+				"beq .celt_autocorr_process8_done_%=;\n"
+				
+				/* Process 8 samples at a time */
+				".celt_autocorr_process8_%=:\n"
+				"vld1.16 {q1}, [%1]!\n"   //Load 8 xx values
+				"subs %3, %3, #1;\n"
+				"vmull.s16 q2, d2, d2;\n" //MULT16_16(xx[i], xx[i])
+				"vmull.s16 q3, d3, d3;\n" //MULT16_16(xx[i], xx[i])
+				"pld [%1, #0];\n"
+				"vsra.s32 q0, q2, #9;\n"  //Shift right by 9 and accumulate to ac0
+				"vsra.s32 q0, q3, #9;\n"  //Shift right by 9 and accumulate to ac0
+				"bne .celt_autocorr_process8_%=;\n"
+
+				".celt_autocorr_process8_done_%=:\n"
+				"ands %3, %2, #7;\n"      //n&7
+				"beq .celt_autocorr_process_remaining_done_%=;\n"
+
+				/* Process remaining samples */
+				"veor.s16 q1, q1, q1;\n"  //Clear q1 so the unused lanes contribute zero
+				".celt_autocorr_process_remaining_%=:\n"
+				"vld1.16 d2[0], [%1]!;\n" //Load 1 xx value
+				"subs %3, %3, #1;\n"
+				"vmull.s16 q2, d2, d2;\n" //MULT16_16(xx[i], xx[i])
+				"vsra.s32 q0, q2, #9;\n"  //Shift right by 9 and accumulate to ac0
+				"bne .celt_autocorr_process_remaining_%=;\n"
+				".celt_autocorr_process_remaining_done_%=:\n"
+			
+				/* Reduce the four partial sums and move the result to an ARM register */
+				"vadd.s32 d0, d0, d1;\n"
+				"vpadd.s32 d0, d0;\n"
+				"vmov.s32 %0, d0[0];\n"
+				: "=r"(ac0), "=r"(scratch1), "=r"(n), "=r"(i)
+				: "1"(xx), "2"(n)
+        /* Clobber d0-d7 explicitly because some gcc versions (4.4.3) don't
+         * alias q(x) to d(2x), d(2x+1) */
+				: "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
+				);
+		ac0 += 1+n;
+
+		shift = celt_ilog2(ac0)-30+10;
+		shift = (shift+1)/2;
+		for(i=0;i<n;i++)
+			xx[i] = VSHR32(xx[i], shift);
+	}
+	/* One dot product per lag, from lag down to 0: ac[lag] = sum over i = lag..n-1 of xx[i]*xx[i-lag] */
+	while (lag>=0)
+	{
+		opus_val16 * xx1 = xx+lag;
+		opus_val16 * xx2 = xx;
+		int scratch4, scratch5;
+
+		__asm__ __volatile__(
+				"veor.s32 q0, q0;\n"     //Clear sum, q0 will contain 4 partial sums
+				"movs %3, %4, lsr #4;\n" //(n-lag)/16
+				"beq .celt_autocorr_process16_done_%=;\n"
+
+				/* Process 16 samples at a time */
+				".celt_autocorr_process16_%=:\n"
+				"vld1.16 {q1-q2}, [%1]!;\n" //Load 16 xx values from xx+lag=xx[i]
+				"vld1.16 {q3-q4}, [%2]!;\n" //Load 16 xx values from xx=xx[i-lag]
+				"vmlal.s16 q0, d2, d6;\n"   //MAC16_16(d, xx[i], xx[i-lag])
+				"vmlal.s16 q0, d3, d7;\n"   //MAC16_16(d, xx[i], xx[i-lag]);
+				"pld [%1, #0];\n"
+				"pld [%2, #0];\n"
+				"vmlal.s16 q0, d4, d8;\n"   //MAC16_16(d, xx[i], xx[i-lag]);
+				"vmlal.s16 q0, d5, d9;\n"   //MAC16_16(d, xx[i], xx[i-lag]);
+				"pld [%1, #16];\n"
+				"pld [%2, #16];\n"
+				"subs %3, %3, #1;\n"
+				"bne .celt_autocorr_process16_%=;\n"
+				".celt_autocorr_process16_done_%=:\n"
+
+				"ands %3, %4, #15;\n"    //(n-lag)&15
+				"beq .celt_autocorr_process_remaining_done_%=;\n"
+				"veor.s32 q1, q1, q1;\n" //Clear q1 so the unused lanes contribute zero
+
+				/* Process remaining samples one by one */
+				".celt_autocorr_process_remaining_%=:\n"
+				"vld1.16 d2[0], [%1]!;\n" //Load 1 xx value from xx+lag=xx[i]
+				"vld1.16 d3[0], [%2]!;\n" //Load 1 xx value from xx=xx[i-lag]
+				"subs %3, %3, #1;\n"
+				"vmlal.s16 q0, d2, d3;\n" //MAC16_16(d, xx[i], xx[i-lag])
+				"bne .celt_autocorr_process_remaining_%=;\n"
+				".celt_autocorr_process_remaining_done_%=:\n"
+
+				/* Reduce the four partial sums and store the result to ac[lag] */
+				"vadd.s32 d0, d0, d1;\n"
+				"vpadd.s32 d0, d0;\n"
+				"vst1.32 d0[0], [%5];\n"
+				: "=&r"(d), "=r"(xx1), "=r"(xx2), "=r"(i), "=r"(scratch4), "=r"(scratch5)
+				: "0"(0), "1"(xx1), "2"(xx2), "4"(n-lag), "5"(ac+lag)
+        /* Clobber d0-d9 explicitly because some gcc versions (4.4.3) don't
+         * alias q(x) to d(2x), d(2x+1) */
+				: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8",
+				"d9"
+				);
+		lag--;
+	}
+	ac[0] += 10;
+
+	RESTORE_STACK;
+}
+#endif /* CELT_AUTOCORR_NEON */
+
+#endif /* FIXED_POINT */
+
+#endif
diff --git a/configure.ac b/configure.ac
index 0c6d725..a36d403 100644
--- a/configure.ac
+++ b/configure.ac
@@ -178,6 +178,11 @@ if test "x${ac_enable_asm}" = xyes ; then
                 AC_DEFINE(ARMv6_ASM, 1, [Use ARMv6 asm optimizations])
                 asm_optimization="${asm_optimization} (Media)"
             fi
+            AS_ASM_ARM_NEON([ARM_HAVE_NEON=1],[ARM_HAVE_NEON=0])
+            if test "x${ARM_HAVE_NEON}" = "x1" ; then
+              AC_DEFINE([ARM_HAVE_NEON], 1, [Use ARM NEON optimizations])
+              asm_optimization="${asm_optimization} (NEON)"
+            fi
         fi
         ;;
     esac