[opus] [PATCH] Optimize silk_LPC_analysis_filter() for ARM NEON

Thu Jul 28 17:18:39 UTC 2016

Created corresponding unit test.
---
 silk/LPC_analysis_filter.c                         |   8 +-
 silk/SigProc_FIX.h                                 |   8 +-
 silk/arm/LPC_analysis_filter_arm.h                 |  60 +++++++
 silk/arm/LPC_analysis_filter_neon_intr.c           | 176 +++++++++++++++++++++
 silk/arm/arm_silk_map.c                            |  14 ++
 .../test_unit_optimization_LPC_analysis_filter.c   |  85 ++++++++++
 silk_headers.mk                                    |   1 +
 silk_sources.mk                                    |   1 +
 tests/test_unit_optimization.c                     |   2 +
 9 files changed, 348 insertions(+), 7 deletions(-)
 create mode 100644 silk/arm/LPC_analysis_filter_arm.h
 create mode 100644 silk/arm/LPC_analysis_filter_neon_intr.c
 create mode 100644 silk/tests/test_unit_optimization_LPC_analysis_filter.c

diff --git a/silk/LPC_analysis_filter.c b/silk/LPC_analysis_filter.c
index 20330d5..d3027f3 100644
--- a/silk/LPC_analysis_filter.c
+++ b/silk/LPC_analysis_filter.c
@@ -44,9 +44,8 @@ POSSIBILITY OF SUCH DAMAGE.
    current implementation silences by casting to unsigned. Enabling
    this should be safe in pretty much all cases, even though it is not technically
    C89-compliant. */
-#define USE_CELT_FIR 0
 
-void silk_LPC_analysis_filter(
+void silk_LPC_analysis_filter_c(
     opus_int16                  *out,               /* O    Output signal                                               */
     const opus_int16            *in,                /* I    Input signal                                                */
     const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]                     */
@@ -74,9 +73,6 @@ void silk_LPC_analysis_filter(
         num[ j ] = -B[ j ];
     }
     celt_fir( in + d, num, out + d, len - d, d, arch );
-    for ( j = 0; j < d; j++ ) {
-        out[ j ] = 0;
-    }
 #else
     (void)arch;
     for( ix = d; ix < len; ix++ ) {
@@ -104,8 +100,8 @@ void silk_LPC_analysis_filter(
         /* Saturate output */
         out[ ix ] = (opus_int16)silk_SAT16( out32 );
     }
+#endif
 
     /* Set first d output samples to zero */
     silk_memset( out, 0, d * sizeof( opus_int16 ) );
-#endif
 }
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index 72df6d3..0e619d0 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -35,6 +35,7 @@ extern "C"
 
 /*#define silk_MACRO_COUNT */          /* Used to enable WMOPS counting */
 
+#define USE_CELT_FIR 0
 #define SILK_MAX_ORDER_LPC            24            /* max order of the LPC analysis in schur() and k2a() */
 
 #include <string.h>                                 /* for memset(), memcpy(), memmove() */
@@ -48,6 +49,7 @@ extern "C"
 #endif
 
 #if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/LPC_analysis_filter_arm.h"
 #include "arm/LPC_inv_pred_gain_arm.h"
 #endif
 
@@ -111,7 +113,7 @@ void silk_biquad_alt(
 );
 
 /* Variable order MA prediction error filter. */
-void silk_LPC_analysis_filter(
+void silk_LPC_analysis_filter_c(
     opus_int16                  *out,               /* O    Output signal                                               */
     const opus_int16            *in,                /* I    Input signal                                                */
     const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]                     */
@@ -156,6 +158,10 @@ void silk_ana_filt_bank_1(
     const opus_int32            N                   /* I    Number of input samples                                     */
 );
 
+#if !defined(OVERRIDE_SILK_LPC_ANALYSIS_FILTER)
+#define silk_LPC_analysis_filter(out, in, B, len, d, arch) (silk_LPC_analysis_filter_c(out, in, B, len, d, arch))
+#endif
+
 #if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
 #define silk_LPC_inverse_pred_gain(A_Q12, order, arch)     ((void)(arch),silk_LPC_inverse_pred_gain_c(A_Q12, order))
 #endif
diff --git a/silk/arm/LPC_analysis_filter_arm.h b/silk/arm/LPC_analysis_filter_arm.h
new file mode 100644
index 0000000..96615d2
--- /dev/null
+++ b/silk/arm/LPC_analysis_filter_arm.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(LPC_ANALYSIS_FILTER_ARM_H)
+# define LPC_ANALYSIS_FILTER_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+void silk_LPC_analysis_filter_neon(
+        opus_int16                  *out,               /* O    Output signal                                               */
+        const opus_int16            *in,                /* I    Input signal                                                */
+        const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]                     */
+        const opus_int32            len,                /* I    Signal length                                               */
+        const opus_int32            d,                  /* I    Filter order                                                */
+        int                         arch                /* I    Run-time architecture                                       */
+);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD)
+#  define OVERRIDE_SILK_LPC_ANALYSIS_FILTER                   (1)
+#  define silk_LPC_analysis_filter(out, in, B, len, d, arch)  (PRESUME_NEON(silk_LPC_analysis_filter)(out, in, B, len, d, arch))
+# endif
+
+# if !defined(OVERRIDE_SILK_LPC_ANALYSIS_FILTER)
+/*Is run-time CPU detection enabled on this platform?*/
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void (*const SILK_LPC_ANALYSIS_FILTER_IMPL[OPUS_ARCHMASK+1])(opus_int16 *out, const opus_int16 *in, const opus_int16 *B, const opus_int32 len, const opus_int32 d, int arch);
+#   define OVERRIDE_SILK_LPC_ANALYSIS_FILTER                  (1)
+#   define silk_LPC_analysis_filter(out, in, B, len, d, arch) ((*SILK_LPC_ANALYSIS_FILTER_IMPL[(arch)&OPUS_ARCHMASK])(out, in, B, len, d, arch))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+#   define OVERRIDE_SILK_LPC_ANALYSIS_FILTER                  (1)
+#   define silk_LPC_analysis_filter(out, in, B, len, d, arch) (silk_LPC_analysis_filter_neon(out, in, B, len, d, arch))
+#  endif
+# endif
+
+#endif /* end LPC_ANALYSIS_FILTER_ARM_H */
diff --git a/silk/arm/LPC_analysis_filter_neon_intr.c b/silk/arm/LPC_analysis_filter_neon_intr.c
new file mode 100644
index 0000000..56d26fb
--- /dev/null
+++ b/silk/arm/LPC_analysis_filter_neon_intr.c
@@ -0,0 +1,176 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file LPC_analysis_filter_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for the silk LPC analysis filter
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "SigProc_FIX.h"
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+
+/*******************************************/
+/* LPC analysis filter                     */
+/* NB! State is kept internally and the    */
+/* filter always starts with zero state    */
+/* first d output samples are set to zero  */
+/*******************************************/
+
+/* OPT: Using celt_fir() for this function should be faster, but it may cause
+   integer overflows in intermediate values (not final results), which the
+   current implementation silences by casting to unsigned. Enabling
+   this should be safe in pretty much all cases, even though it is not technically
+   C89-compliant. */
+
+void silk_LPC_analysis_filter_neon(
+    opus_int16                  *out,               /* O    Output signal                                               */
+    const opus_int16            *in,                /* I    Input signal                                                */
+    const opus_int16            *B,                 /* I    MA prediction coefficients, Q12 [order]                     */
+    const opus_int32            len,                /* I    Signal length                                               */
+    const opus_int32            d,                  /* I    Filter order                                                */
+    int                         arch                /* I    Run-time architecture                                       */
+)
+{
+    opus_int   j;
+#if USE_CELT_FIR
+    opus_int16 num[SILK_MAX_ORDER_LPC];
+#else
+    int ix;
+    const int leftover = (len - d) & 7;   /* Outputs remaining after the 8-wide main loop */
+    VARDECL(opus_int16, rB);              /* B in reversed order, zero-padded */
+    SAVE_STACK;                           /* Kept with the declarations: may expand to one */
+#endif
+
+    silk_assert( d >= 6 );
+    silk_assert( (d & 1) == 0 );
+    silk_assert( d <= len );
+
+#if USE_CELT_FIR
+    silk_assert( d <= SILK_MAX_ORDER_LPC );
+    for ( j = 0; j < d; j++ ) {
+        num[ j ] = -B[ j ];
+    }
+    celt_fir( in + d, num, out + d, len - d, d, arch );
+#else
+    (void)arch;
+
+    /* Reverse B into rB; pad with 3 zeros so the 4-tap loops may over-run when (d % 4) is non-zero. */
+    ALLOC(rB, d + 3, opus_int16);
+    for (ix = 0; ix < d - 3; ix += 4) {
+        vst1_s16(rB + ix, vrev64_s16(vld1_s16(B + d - ix - 4)));
+    }
+    for (; ix < d; ix++) {
+        rB[ix] = B[d - ix - 1];
+    }
+    rB[d] = rB[d + 1] = rB[d + 2] = 0;
+
+    for (ix = d; ix < len - 7; ix += 8) { /* 8 output samples per iteration */
+        int16x8_t in_s16x8          = vld1q_s16(in + ix);
+        int32x4_t out32_Q12_0_s32x4 = vshll_n_s16(vget_low_s16 (in_s16x8), 12);
+        int32x4_t out32_Q12_1_s32x4 = vshll_n_s16(vget_high_s16(in_s16x8), 12);
+        for (j = 0; j < d; j += 4) {
+            const int16x4_t rB_s16x4 = vld1_s16(rB + j);
+            in_s16x8          = vld1q_s16(in - d + ix + j + 0);
+            out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vget_low_s16 (in_s16x8), rB_s16x4, 0);
+            out32_Q12_1_s32x4 = vmlsl_lane_s16(out32_Q12_1_s32x4, vget_high_s16(in_s16x8), rB_s16x4, 0);
+            in_s16x8          = vld1q_s16(in - d + ix + j + 1);
+            out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vget_low_s16 (in_s16x8), rB_s16x4, 1);
+            out32_Q12_1_s32x4 = vmlsl_lane_s16(out32_Q12_1_s32x4, vget_high_s16(in_s16x8), rB_s16x4, 1);
+            in_s16x8          = vld1q_s16(in - d + ix + j + 2);
+            out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vget_low_s16 (in_s16x8), rB_s16x4, 2);
+            out32_Q12_1_s32x4 = vmlsl_lane_s16(out32_Q12_1_s32x4, vget_high_s16(in_s16x8), rB_s16x4, 2);
+            in_s16x8          = vld1q_s16(in - d + ix + j + 3);
+            out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vget_low_s16 (in_s16x8), rB_s16x4, 3);
+            out32_Q12_1_s32x4 = vmlsl_lane_s16(out32_Q12_1_s32x4, vget_high_s16(in_s16x8), rB_s16x4, 3);
+        }
+        vst1q_s16(out + ix, vcombine_s16(vqrshrn_n_s32(out32_Q12_0_s32x4, 12), vqrshrn_n_s32(out32_Q12_1_s32x4, 12)));
+    }
+    if (leftover) { /* NOTE(review): the tail still does full-width loads, so it reads a few samples past in[len-1] */
+        if (leftover > 4) {
+            int16x8_t out_s16x8;
+            int16x8_t in_s16x8          = vld1q_s16(in + ix);
+            int32x4_t out32_Q12_0_s32x4 = vshll_n_s16(vget_low_s16 (in_s16x8), 12);
+            int32x4_t out32_Q12_1_s32x4 = vshll_n_s16(vget_high_s16(in_s16x8), 12);
+            for (j = 0; j < d; j += 4) {
+                const int16x4_t rB_s16x4 = vld1_s16(rB + j);
+                in_s16x8          = vld1q_s16(in - d + ix + j + 0);
+                out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vget_low_s16 (in_s16x8), rB_s16x4, 0);
+                out32_Q12_1_s32x4 = vmlsl_lane_s16(out32_Q12_1_s32x4, vget_high_s16(in_s16x8), rB_s16x4, 0);
+                in_s16x8          = vld1q_s16(in - d + ix + j + 1);
+                out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vget_low_s16 (in_s16x8), rB_s16x4, 1);
+                out32_Q12_1_s32x4 = vmlsl_lane_s16(out32_Q12_1_s32x4, vget_high_s16(in_s16x8), rB_s16x4, 1);
+                in_s16x8          = vld1q_s16(in - d + ix + j + 2);
+                out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vget_low_s16 (in_s16x8), rB_s16x4, 2);
+                out32_Q12_1_s32x4 = vmlsl_lane_s16(out32_Q12_1_s32x4, vget_high_s16(in_s16x8), rB_s16x4, 2);
+                in_s16x8          = vld1q_s16(in - d + ix + j + 3);
+                out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vget_low_s16 (in_s16x8), rB_s16x4, 3);
+                out32_Q12_1_s32x4 = vmlsl_lane_s16(out32_Q12_1_s32x4, vget_high_s16(in_s16x8), rB_s16x4, 3);
+            }
+            out_s16x8 = vcombine_s16(vqrshrn_n_s32(out32_Q12_0_s32x4, 12), vqrshrn_n_s32(out32_Q12_1_s32x4, 12));
+            vst1_s16      (out + ix,     vget_low_s16(out_s16x8));
+            vst1q_lane_s16(out + ix + 4, out_s16x8, 4);
+            if (leftover >= 6) {
+                vst1q_lane_s16(out + ix + 5, out_s16x8, 5);
+                if (leftover == 7) {
+                    vst1q_lane_s16(out + ix + 6, out_s16x8, 6);
+                }
+            }
+        } else {
+            int16x4_t out32_Q12_s16x4;
+            int32x4_t out32_Q12_0_s32x4 = vshll_n_s16(vld1_s16(in + ix), 12);
+            for (j = 0; j < d; j += 4) {
+                const int16x4_t rB_s16x4 = vld1_s16(rB + j);
+                out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vld1_s16(in - d + ix + j + 0), rB_s16x4, 0);
+                out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vld1_s16(in - d + ix + j + 1), rB_s16x4, 1);
+                out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vld1_s16(in - d + ix + j + 2), rB_s16x4, 2);
+                out32_Q12_0_s32x4 = vmlsl_lane_s16(out32_Q12_0_s32x4, vld1_s16(in - d + ix + j + 3), rB_s16x4, 3);
+            }
+            out32_Q12_s16x4 = vqrshrn_n_s32(out32_Q12_0_s32x4, 12);
+            if (leftover == 4) {
+                vst1_s16(out + ix, out32_Q12_s16x4);
+            } else {
+                vst1_lane_s16(out + ix, out32_Q12_s16x4, 0);
+                if (leftover >= 2) {
+                    vst1_lane_s16(out + ix + 1, out32_Q12_s16x4, 1);
+                    if (leftover == 3) {
+                        vst1_lane_s16(out + ix + 2, out32_Q12_s16x4, 2);
+                    }
+                }
+            }
+        }
+    }
+    RESTORE_STACK;
+#endif
+
+    /* Set first d output samples to zero */
+    silk_memset( out, 0, d * sizeof( opus_int16 ) );
+}
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 59ceb6e..b1783c7 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -36,6 +36,20 @@ POSSIBILITY OF SUCH DAMAGE.
 
 # if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
 
+void (*const SILK_LPC_ANALYSIS_FILTER_IMPL[OPUS_ARCHMASK + 1])( /* Per-arch implementations, indexed by (arch & OPUS_ARCHMASK)      */
+        opus_int16                  *out,                       /* O    Output signal                                               */
+        const opus_int16            *in,                        /* I    Input signal                                                */
+        const opus_int16            *B,                         /* I    MA prediction coefficients, Q12 [order]                     */
+        const opus_int32            len,                        /* I    Signal length                                               */
+        const opus_int32            d,                          /* I    Filter order                                                */
+        int                         arch                        /* I    Run-time architecture                                       */
+) = {
+      silk_LPC_analysis_filter_c,              /* ARMv4 */
+      silk_LPC_analysis_filter_c,              /* EDSP */
+      silk_LPC_analysis_filter_c,              /* Media */
+      MAY_HAVE_NEON(silk_LPC_analysis_filter), /* Neon */
+};
+
 opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O   Returns inverse prediction gain in energy domain, Q30        */
         const opus_int16            *A_Q12,                             /* I   Prediction coefficients, Q12 [order]                         */
         const opus_int              order                               /* I   Prediction order                                             */
diff --git a/silk/tests/test_unit_optimization_LPC_analysis_filter.c b/silk/tests/test_unit_optimization_LPC_analysis_filter.c
new file mode 100644
index 0000000..eae2daa
--- /dev/null
+++ b/silk/tests/test_unit_optimization_LPC_analysis_filter.c
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#include "modes.h"
+#include "SigProc_FIX.h"
+
+#define MAX_ORDER 32
+
+static int test_silk_LPC_analysis_filter(int arch) /* Returns 0 if the C and dispatched outputs match, -1 otherwise */
+{
+   opus_int16 out_org[MAX_PERIOD], out_opt[MAX_PERIOD]; /* O    Output signal                                               */
+   opus_int16 in[MAX_PERIOD + MAX_ORDER];               /* I    Input signal                                                */
+   opus_int16 B[MAX_ORDER];                             /* I    MA prediction coefficients, Q12 [order]                     */
+   opus_int32 len;                                      /* I    Signal length                                               */
+   opus_int32 d;                                        /* I    Filter order                                                */
+   unsigned int i;
+
+   printf("%44s() ...", __func__);
+   for(d=6;d<=MAX_ORDER;d+=2) /* Even orders only, starting at 6. */
+   {
+      for(len=d;len<=MAX_PERIOD;len++) /* len is larger than or equal to d. */
+      {
+         for (i=0;i<MAX_PERIOD+MAX_ORDER;++i)
+         {
+            in[i] = (rand() % 32767) - 16384;
+         }
+         for (i=0;i<MAX_PERIOD;++i)
+         {
+            out_org[i] = (rand() % 32767) - 16384;
+         }
+         for (i=0;i<MAX_ORDER;++i)
+         {
+            B[i] = (rand() % 32767) - 16384;
+         }
+         memcpy(out_opt, out_org, sizeof(out_org)); /* Same initial contents in both output buffers. */
+
+         silk_LPC_analysis_filter_c(out_org, in, B, len, d, arch);
+         silk_LPC_analysis_filter  (out_opt, in, B, len, d, arch);
+         if (memcmp(out_org, out_opt, sizeof(out_org))) /* Whole buffer, so samples at index >= len are compared too. */
+         {
+            printf("d=%2d len=%3d failed!\nError in lpc unit test!!!\n", (int)d, (int)len);
+            for (i=0;i<sizeof(out_org) / sizeof(*out_org);i++)
+            {
+               if (out_org[i] != out_opt[i])
+               {
+                  printf("out_org[%3u]=%d, out_opt[%3u]=%d\n", i, out_org[i], i, out_opt[i]);
+               }
+            }
+            return -1;
+         }
+      }
+   }
+   printf(" passed!\n");
+   return 0;
+}
diff --git a/silk_headers.mk b/silk_headers.mk
index 10a169a..d3dce08 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -22,6 +22,7 @@ silk/resampler_rom.h \
 silk/resampler_structs.h \
 silk/SigProc_FIX.h \
 silk/x86/SigProc_FIX_sse.h \
+silk/arm/LPC_analysis_filter_arm.h \
 silk/arm/LPC_inv_pred_gain_arm.h \
 silk/arm/macros_armv4.h \
 silk/arm/macros_armv5e.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index ac5647b..d2d5b35 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -85,6 +85,7 @@ silk/x86/VQ_WMat_EC_sse.c
 
 SILK_SOURCES_ARM_NEON_INTR = \
 silk/arm/arm_silk_map.c \
+silk/arm/LPC_analysis_filter_neon_intr.c \
 silk/arm/LPC_inv_pred_gain_neon_intr.c \
 silk/arm/NSQ_neon.c
 
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index 1e54caf..55425c4 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -45,6 +45,7 @@
 
 #endif
 
+# include "silk/tests/test_unit_optimization_LPC_analysis_filter.c"
 # include "silk/tests/test_unit_optimization_LPC_inv_pred_gain.c"
 
 #define NUM_UNIT_TEST_LOOP 10
@@ -65,6 +66,7 @@ int main(void)
       result |= test_silk_LPC_inverse_pred_gain_Q24(arch);
       result |= test_warped_autocorrelation(arch);
 #endif /* FIXED_POINT */
+      result |= test_silk_LPC_analysis_filter(arch);
       result |= test_silk_LPC_inverse_pred_gain(arch);
    }
    return result;
-- 
2.8.0.rc3.226.g39d4020