[opus] [PATCH 2/5] Optimize fixed-point celt_fir_c() for ARM NEON

Thu Jul 14 00:48:59 UTC 2016

Add celt_fir_neon(), a fixed-point ARM NEON intrinsics implementation of celt_fir_c().
Add the test tests/test_unit_optimization, which unit tests the optimization against celt_fir_c().
---
 .gitignore                              |   1 +
 Makefile.am                             |  39 ++++-
 celt/arm/arm_celt_map.c                 |  17 +++
 celt/arm/celt_lpc_arm.h                 |  65 ++++++++
 celt/arm/celt_lpc_neon_intr.c           | 254 ++++++++++++++++++++++++++++++++
 celt/celt_lpc.h                         |   5 +
 celt/tests/test_unit_dft.c              |   1 +
 celt/tests/test_unit_mathops.c          |   1 +
 celt/tests/test_unit_mdct.c             |   1 +
 celt/tests/test_unit_optimization_lpc.c |  96 ++++++++++++
 celt/tests/test_unit_rotation.c         |   1 +
 celt_headers.mk                         |   1 +
 celt_sources.mk                         |   1 +
 tests/test_unit_optimization.c          |  62 ++++++++
 14 files changed, 541 insertions(+), 4 deletions(-)
 create mode 100644 celt/arm/celt_lpc_arm.h
 create mode 100644 celt/arm/celt_lpc_neon_intr.c
 create mode 100644 celt/tests/test_unit_optimization_lpc.c
 create mode 100644 tests/test_unit_optimization.c

diff --git a/.gitignore b/.gitignore
index 33127c9..05d0582 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,6 +49,7 @@ tests/test_opus_api
 tests/test_opus_decode
 tests/test_opus_encode
 tests/test_opus_padding
+tests/test_unit_optimization
 celt/arm/armopts.s
 celt/dump_modes/dump_modes
 celt/tests/test_unit_cwrs32
diff --git a/Makefile.am b/Makefile.am
index 7a69114..2bfb923 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -84,9 +84,36 @@ pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_type
 noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
 
 if EXTRA_PROGRAMS
-noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct celt/tests/test_unit_rotation celt/tests/test_unit_types
-
-TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_rotation celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding
+noinst_PROGRAMS = opus_demo \
+                  repacketizer_demo \
+                  opus_compare \
+                  celt/tests/test_unit_cwrs32 \
+                  celt/tests/test_unit_dft \
+                  celt/tests/test_unit_entropy \
+                  celt/tests/test_unit_laplace \
+                  celt/tests/test_unit_mathops \
+                  celt/tests/test_unit_mdct \
+                  celt/tests/test_unit_rotation \
+                  celt/tests/test_unit_types \
+                  tests/test_opus_api \
+                  tests/test_opus_encode \
+                  tests/test_opus_decode \
+                  tests/test_opus_padding \
+                  tests/test_unit_optimization
+
+TESTS = celt/tests/test_unit_types \
+        celt/tests/test_unit_mathops \
+        celt/tests/test_unit_entropy \
+        celt/tests/test_unit_laplace \
+        celt/tests/test_unit_dft \
+        celt/tests/test_unit_mdct \
+        celt/tests/test_unit_rotation \
+        celt/tests/test_unit_cwrs32 \
+        tests/test_opus_api \
+        tests/test_opus_decode \
+        tests/test_opus_encode \
+        tests/test_opus_padding \
+        tests/test_unit_optimization
 
 opus_demo_SOURCES = src/opus_demo.c
 
@@ -111,6 +138,9 @@ tests_test_opus_decode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 tests_test_opus_padding_SOURCES = tests/test_opus_padding.c tests/test_opus_common.h
 tests_test_opus_padding_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
+tests_test_unit_optimization_SOURCES = tests/test_unit_optimization.c
+tests_test_unit_optimization_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
+
 celt_tests_test_unit_cwrs32_SOURCES = celt/tests/test_unit_cwrs32.c
 celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
 
@@ -276,7 +306,8 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl
 OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
                     $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \
                     $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \
-                    $(celt_tests_test_unit_dft_SOURCES:.c=.o)
+                    $(celt_tests_test_unit_dft_SOURCES:.c=.o) \
+                    $(tests_test_unit_optimization_SOURCES:.c=.o)
 
 if HAVE_SSE
 SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 4d4d069..74869ab 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -29,6 +29,7 @@
 #include "config.h"
 #endif
 
+#include "celt_lpc.h"
 #include "pitch.h"
 #include "kiss_fft.h"
 #include "mdct.h"
@@ -39,6 +40,22 @@
 #  if ((defined(OPUS_ARM_MAY_HAVE_NEON) && !defined(OPUS_ARM_PRESUME_NEON)) || \
     (defined(OPUS_ARM_MAY_HAVE_MEDIA) && !defined(OPUS_ARM_PRESUME_MEDIA)) || \
     (defined(OPUS_ARM_MAY_HAVE_EDSP) && !defined(OPUS_ARM_PRESUME_EDSP)))
+void celt_fir_neon(
+         const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *_y,
+         int N,
+         int ord,
+         int arch); /* Same prototype as the declaration in celt_lpc_arm.h. */
+
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val16 *, int, int, int) = { /* celt_fir() dispatch table, indexed by arch&OPUS_ARCHMASK. */
+  celt_fir_c,             /* ARMv4 */
+  celt_fir_c,             /* EDSP */
+  celt_fir_c,             /* Media */
+  MAY_HAVE_NEON(celt_fir) /* NEON */
+};
+
 opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
     const opus_val16 *, opus_val32 *, int , int) = {
   celt_pitch_xcorr_c,               /* ARMv4 */
diff --git a/celt/arm/celt_lpc_arm.h b/celt/arm/celt_lpc_arm.h
new file mode 100644
index 0000000..101df3d
--- /dev/null
+++ b/celt/arm/celt_lpc_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(CELT_LPC_ARM_H)
+# define CELT_LPC_ARM_H
+
+# include "armcpu.h"
+
+# if defined(FIXED_POINT)
+/* Fixed-point celt_fir() written with NEON intrinsics; defined in celt_lpc_neon_intr.c. */
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+void celt_fir_neon(
+         const opus_val16 *_x,
+         const opus_val16 *num,
+         opus_val16 *_y,
+         int N,
+         int ord,
+         int arch);
+#  endif /* OPUS_ARM_MAY_HAVE_NEON */
+/* Without run-time CPU detection celt_fir() is bound at compile time. NOTE(review): PRESUME_NEON() is not defined here; presumably it comes from armcpu.h - confirm. */
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_CELT_FIR (1)
+#   define celt_fir(x, num, y, N, ord, arch) \
+  ((void)(arch),PRESUME_NEON(celt_fir)(x, num, y, N, ord, arch))
+#  endif /* !OPUS_HAVE_RTCD */
+
+#if !defined(OVERRIDE_CELT_FIR)
+/* With run-time CPU detection, celt_fir() dispatches through CELT_FIR_IMPL[], indexed by arch. NOTE(review): this guard is worded differently from the one around the table's definition in arm_celt_map.c; confirm the two always agree. */
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+   || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+   && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+      const opus_val16 *, opus_val16 *, int, int, int);
+
+#  define OVERRIDE_CELT_FIR
+#   define celt_fir(x, num, y, N, ord, arch) \
+  ((*CELT_FIR_IMPL[(arch)&OPUS_ARCHMASK])(x, num, y, N, ord, arch))
+# endif /* run-time dispatch */
+#endif /* !OVERRIDE_CELT_FIR */
+#endif /* end FIXED_POINT */
+
+#endif /* end CELT_LPC_ARM_H */
diff --git a/celt/arm/celt_lpc_neon_intr.c b/celt/arm/celt_lpc_neon_intr.c
new file mode 100644
index 0000000..4715d0b
--- /dev/null
+++ b/celt/arm/celt_lpc_neon_intr.c
@@ -0,0 +1,254 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file celt_lpc_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for celt lpc functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+
+#if defined(FIXED_POINT)
+/* NEON FIR filter: for i in [0,N), y[i] is _x[i] plus the sum over k in [0,ord) of num[k]*_x[i-1-k], with the sum scaled down by SIG_SHIFT and the result saturated to 16 bits. */
+void celt_fir_neon(
+         const opus_val16 *_x, /* input; the ord samples before _x[0] are read as history */
+         const opus_val16 *num, /* ord filter taps */
+         opus_val16 *y, /* output, N samples */
+         int N, /* number of output samples */
+         int ord, /* number of taps */
+         int arch) /* unused in this implementation */
+{
+   int i,j;
+   const int leftover = N & 7; /* outputs remaining after the 8-wide main loop */
+   const opus_val16 *x = _x-ord; /* start of the history, so x[i+j] pairs with rnum[j] */
+   VARDECL(opus_val16, rnum);
+   SAVE_STACK;
+   /* Extend rnum by 3 zeros to handle the case that (ord % 4) is non-zero: the tap loops below always consume 4 taps per step. */
+   ALLOC(rnum, ord+3, opus_val16);
+   for (i=0;i<ord-3;i+=4) /* reverse the taps four at a time */
+      vst1_s16(rnum+i, vrev64_s16(vld1_s16(num+ord-i-4)));
+   for (;i<ord;i++) /* reverse the remaining (ord % 4) taps one by one */
+      rnum[i] = num[ord-i-1];
+   rnum[ord] = rnum[ord+1] = rnum[ord+2] = 0;
+   (void)arch;
+
+#ifdef SMALL_FOOTPRINT /* accumulators start at (_x << SIG_SHIFT); one saturating rounding narrowing shift at the end */
+   for (i=0;i<N-7;i+=8) /* main loop: 8 outputs per iteration */
+   {
+      int16x8_t x_s16x8 = vld1q_s16(_x+i);
+      int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT);
+      int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT);
+      for (j=0;j<ord;j+=4) /* 4 taps per step; each tap multiplies 8 consecutive inputs */
+      {
+         const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+         x_s16x8 = vld1q_s16(x+i+j+0);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0);
+         x_s16x8 = vld1q_s16(x+i+j+1);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1);
+         x_s16x8 = vld1q_s16(x+i+j+2);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2);
+         x_s16x8 = vld1q_s16(x+i+j+3);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3);
+      }
+      vst1q_s16(y+i, vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT)));
+   }
+   if (leftover) /* NOTE(review): the tail still issues full 8- or 4-lane loads from _x+i and x+i+j+k, so it reads past _x[N-1]; confirm every caller's buffer allows that. */
+   {
+      if (leftover > 4) /* 5..7 outputs left: compute 8 lanes, store only the valid ones */
+      {
+         int16x8_t x_s16x8 = vld1q_s16(_x+i);
+         int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT);
+         int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            x_s16x8 = vld1q_s16(x+i+j+0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0);
+            x_s16x8 = vld1q_s16(x+i+j+1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1);
+            x_s16x8 = vld1q_s16(x+i+j+2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2);
+            x_s16x8 = vld1q_s16(x+i+j+3);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3);
+         }
+         const int16x8_t y_s16x8 = vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT));
+         vst1_s16(y+i, vget_low_s16(y_s16x8)); /* outputs 0..3 are always valid here */
+         vst1q_lane_s16(y+i+4, y_s16x8, 4);
+         if (leftover >= 6)
+         {
+            vst1q_lane_s16(y+i+5, y_s16x8, 5);
+            if (leftover == 7)
+            {
+               vst1q_lane_s16(y+i+6, y_s16x8, 6);
+            }
+         }
+      }
+      else { /* 1..4 outputs left: compute 4 lanes, store only the valid ones */
+         int32x4_t sum0_s32x4 = vshll_n_s16(vld1_s16(_x+i), SIG_SHIFT);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0), rnum_s16x4, 0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1), rnum_s16x4, 1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2), rnum_s16x4, 2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3), rnum_s16x4, 3);
+         }
+         const int16x4_t y_s16x4 = vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT);
+         if (leftover == 4)
+         {
+            vst1_s16(y+i, y_s16x4);
+         }
+         else
+         {
+            vst1_lane_s16(y+i, y_s16x4, 0);
+            if (leftover >= 2)
+            {
+               vst1_lane_s16(y+i+1, y_s16x4, 1);
+               if (leftover == 3)
+               {
+                  vst1_lane_s16(y+i+2, y_s16x4, 2);
+               }
+            }
+         }
+      }
+   }
+#else /* !SMALL_FOOTPRINT: accumulate from zero, rounding shift by SIG_SHIFT, then add _x and saturate */
+   for (i=0;i<N-7;i+=8) /* main loop: 8 outputs per iteration */
+   {
+      int32x4_t sum0_s32x4, sum1_s32x4;
+      sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0);
+      for (j=0;j<ord;j+=4) /* 4 taps per step; each tap multiplies 8 consecutive inputs */
+      {
+         const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+         int16x8_t x_s16x8 = vld1q_s16(x+i+j+0);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0);
+         x_s16x8 = vld1q_s16(x+i+j+1);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1);
+         x_s16x8 = vld1q_s16(x+i+j+2);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2);
+         x_s16x8 = vld1q_s16(x+i+j+3);
+         sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3);
+         sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3);
+      }
+      sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+      sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT);
+      const int16x8_t x_s16x8 = vld1q_s16(_x+i);
+      sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8));
+      sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8));
+      vst1q_s16(y+i, vcombine_s16(vqmovn_s32(sum0_s32x4), vqmovn_s32(sum1_s32x4)));
+   }
+   if (leftover) /* NOTE(review): the tail still issues full 8- or 4-lane loads from _x+i and x+i+j+k, so it reads past _x[N-1]; confirm every caller's buffer allows that. */
+   {
+      if (leftover > 4) /* 5..7 outputs left: compute 8 lanes, store only the valid ones */
+      {
+         int32x4_t sum0_s32x4, sum1_s32x4;
+         sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            int16x8_t x_s16x8 = vld1q_s16(x+i+j+0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0);
+            x_s16x8 = vld1q_s16(x+i+j+1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1);
+            x_s16x8 = vld1q_s16(x+i+j+2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2);
+            x_s16x8 = vld1q_s16(x+i+j+3);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3);
+            sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3);
+         }
+         sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+         sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT);
+         const int16x8_t x_s16x8 = vld1q_s16(_x+i);
+         sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8));
+         sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8));
+         const int16x8_t y_s16x8 = vcombine_s16(vqmovn_s32(sum0_s32x4), vqmovn_s32(sum1_s32x4));
+         vst1_s16(y+i, vget_low_s16(y_s16x8)); /* outputs 0..3 are always valid here */
+         vst1q_lane_s16(y+i+4, y_s16x8, 4);
+         if (leftover >= 6)
+         {
+            vst1q_lane_s16(y+i+5, y_s16x8, 5);
+            if (leftover == 7)
+            {
+               vst1q_lane_s16(y+i+6, y_s16x8, 6);
+            }
+         }
+      }
+      else { /* 1..4 outputs left: compute 4 lanes, store only the valid ones */
+         int32x4_t sum0_s32x4 = vdupq_n_s32(0);
+         for (j=0;j<ord;j+=4)
+         {
+            const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0), rnum_s16x4, 0);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1), rnum_s16x4, 1);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2), rnum_s16x4, 2);
+            sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3), rnum_s16x4, 3);
+         }
+         sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+         sum0_s32x4 = vaddw_s16(sum0_s32x4, vld1_s16(_x+i));
+         const int16x4_t y_s16x4 = vqmovn_s32(sum0_s32x4);
+         if (leftover == 4)
+         {
+            vst1_s16(y+i, y_s16x4);
+         }
+         else
+         {
+            vst1_lane_s16(y+i, y_s16x4, 0);
+            if (leftover >= 2)
+            {
+               vst1_lane_s16(y+i+1, y_s16x4, 1);
+               if (leftover == 3)
+               {
+                  vst1_lane_s16(y+i+2, y_s16x4, 2);
+               }
+            }
+         }
+      }
+   }
+#endif /* SMALL_FOOTPRINT */
+   RESTORE_STACK;
+}
+
+#endif
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index a4c5fd6..76a73c0 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -35,6 +35,11 @@
 #include "x86/celt_lpc_sse.h"
 #endif
 
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+   || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/celt_lpc_arm.h"
+#endif
+
 #define LPC_ORDER 24
 
 void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 6166eb0..582618e 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -52,6 +52,7 @@
 # include "celt_lpc.c"
 # include "pitch.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "mdct.c"
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index fd3319d..da92f16 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -66,6 +66,7 @@
 #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 # include "arm/armcpu.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 8dbb9ca..0658c7a 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -53,6 +53,7 @@
 # include "pitch.c"
 # include "celt_lpc.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "arm/celt_ne10_fft.c"
diff --git a/celt/tests/test_unit_optimization_lpc.c b/celt/tests/test_unit_optimization_lpc.c
new file mode 100644
index 0000000..7247046
--- /dev/null
+++ b/celt/tests/test_unit_optimization_lpc.c
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+# define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#ifndef CELT_C
+# define CELT_C
+#endif
+#include "celt_lpc.h"
+#include "modes.h"
+
+#ifdef FIXED_POINT
+
+#define MAX_ORDER 32
+/* Checks celt_fir() against celt_fir_c() on random data for every ord in [0,MAX_ORDER] and N in [ord,MAX_PERIOD]; returns 0 on success, -1 on the first mismatch. */
+static int test_fir(int arch)
+{
+   opus_val16 x[MAX_PERIOD+MAX_ORDER]; /* the first MAX_ORDER entries serve as filter history */
+   opus_val16 num[MAX_ORDER];
+   opus_val16 yorg[MAX_PERIOD], yopt[MAX_PERIOD]; /* reference and optimized outputs */
+   int N, ord;
+   unsigned int i;
+
+   printf("%50s", "celt_fir() ...");
+   for(ord=0;ord<=MAX_ORDER;ord++)
+   {
+      for(N=ord;N<=MAX_PERIOD;N++) /* N is larger than or equal to ord. */
+      {
+         for (i=0;i<MAX_PERIOD+MAX_ORDER;++i)
+         {
+            x[i] = (rand() % 32767) - 16384;
+         }
+         for (i=0;i<MAX_PERIOD;++i)
+         {
+            yorg[i] = (rand() % 32767) - 16384;
+         }
+         for (i=0;i<MAX_ORDER;++i)
+         {
+            num[i] = (rand() % 32767) - 16384;
+         }
+         memcpy(yopt, yorg, sizeof(yorg)); /* identical prefill, so a write beyond N also shows up as a mismatch */
+
+         celt_fir_c(x+MAX_ORDER, num, yorg, N, ord, arch);
+         celt_fir  (x+MAX_ORDER, num, yopt, N, ord, arch);
+         if (memcmp(yorg, yopt, sizeof(yorg))) /* compares the whole buffers, not just the first N samples */
+         {
+            printf("ord=%2d N=%3d failed!\nError in lpc unit test!!!\n", ord, N);
+            for (i=0;i<sizeof(yorg) / sizeof(*yorg);i++)
+            {
+               if (yorg[i] != yopt[i])
+               {
+                  printf("yorg[%3u]=%d, yopt[%3u]=%d\n", i, yorg[i], i, yopt[i]); /* i is unsigned, hence %u */
+               }
+            }
+            return -1;
+         }
+      }
+   }
+   printf(" passed!\n");
+   return 0;
+}
+#endif /* FIXED_POINT */
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 1080c20..3a85a29 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -64,6 +64,7 @@
 #elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 # include "arm/armcpu.c"
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+#  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
diff --git a/celt_headers.mk b/celt_headers.mk
index c9df94b..36ae290 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -34,6 +34,7 @@ celt/static_modes_fixed.h \
 celt/static_modes_float_arm_ne10.h \
 celt/static_modes_fixed_arm_ne10.h \
 celt/arm/armcpu.h \
+celt/arm/celt_lpc_arm.h \
 celt/arm/fixed_armv4.h \
 celt/arm/fixed_armv5e.h \
 celt/arm/fixed_arm64.h \
diff --git a/celt_sources.mk b/celt_sources.mk
index 2ffe99a..37c0129 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -37,6 +37,7 @@ CELT_AM_SOURCES_ARM_ASM = \
 celt/arm/armopts.s.in
 
 CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_lpc_neon_intr.c \
 celt/arm/celt_neon_intr.c
 
 CELT_SOURCES_ARM_NE10= \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
new file mode 100644
index 0000000..7eeab38
--- /dev/null
+++ b/tests/test_unit_optimization.c
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include "stack_alloc.h"
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#ifdef FIXED_POINT
+
+# include "celt/tests/test_unit_optimization_lpc.c"
+
+#endif
+/* Runs the optimization unit tests for up to 10 rounds; returns 0 when every round passes, non-zero after the first failing round. */
+int main(void)
+{
+   int result = 0; /* 0: passed; other: failed */
+#ifdef FIXED_POINT
+   int arch = opus_select_arch();
+#endif /* FIXED_POINT */
+   int count = 10; /* maximum number of test rounds */
+   ALLOC_STACK; /* placed after the last declaration so that no declaration follows a statement */
+
+   while (!result && count--) {
+      printf("\n--------------------------- Testing optimization ---------------------------\n");
+#ifdef FIXED_POINT
+      result |= test_fir(arch);
+#endif /* FIXED_POINT */
+   }
+   return result;
+}
-- 
2.8.0.rc3.226.g39d4020