[opus] [PATCH 5/5] Optimize silk/LPC_inv_pred_gain.c for ARM NEON
Linfeng Zhang
linfengz at google.com
Thu Jul 14 00:49:02 UTC 2016
Optimized LPC_inverse_pred_gain_QA(), silk_LPC_inverse_pred_gain() and
silk_LPC_inverse_pred_gain_Q24() for ARM NEON.
Created corresponding unit test.
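
For readers unfamiliar with the Opus RTCD convention this patch follows: the public
function name becomes a macro that either calls the portable _c implementation
directly or, when run-time CPU detection is enabled, indexes a per-architecture
function-pointer table with the caller's arch value (see the changes to
silk/SigProc_FIX.h and silk/arm/arm_silk_map.c below). Here is a minimal,
self-contained sketch of that pattern; all names in it (my_gain, gain_impl,
MY_ARCHMASK) are illustrative placeholders, not the actual Opus symbols.

#include <stdio.h>

#define MY_ARCHMASK 3                        /* stands in for OPUS_ARCHMASK */

/* Dummy implementations; both return a placeholder Q30 "gain". */
static int gain_c(const short *a, int order)    { (void)a; (void)order; return 1 << 30; }
static int gain_neon(const short *a, int order) { (void)a; (void)order; return 1 << 30; }

/* One entry per architecture level; levels without NEON fall back to the C version. */
static int (*const gain_impl[MY_ARCHMASK + 1])(const short *, int) = {
    gain_c,    /* ARMv4 */
    gain_c,    /* EDSP  */
    gain_c,    /* Media */
    gain_neon, /* NEON  */
};

/* The public name becomes a macro; as in the patch, it picks up an 'arch'
 * variable that must already be in scope at the call site. */
#define my_gain(a, order) ((*gain_impl[(arch) & MY_ARCHMASK])((a), (order)))

int main(void)
{
    short coeffs[16] = { 0 };
    int arch = 3;        /* in Opus this would come from opus_select_arch() */
    printf("gain = %d\n", my_gain(coeffs, 16));
    return 0;
}

In this patch, silk_LPC_inverse_pred_gain() and silk_LPC_inverse_pred_gain_Q24()
play the role of my_gain, with silk_LPC_inverse_pred_gain_neon() in the NEON slot.
The (void)(arch) in the non-RTCD fallback macros presumably just keeps the arch
argument referenced so builds without RTCD do not warn about an unused variable.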
---
silk/CNG.c | 2 +-
silk/LPC_inv_pred_gain.c | 18 +-
silk/NLSF2A.c | 3 +-
silk/SigProc_FIX.h | 19 +-
silk/arm/LPC_inv_pred_gain_arm.h | 84 +++++++
silk/arm/LPC_inv_pred_gain_neon_intr.c | 258 +++++++++++++++++++++
silk/arm/arm_silk_map.c | 24 +-
silk/decode_parameters.c | 4 +-
silk/fixed/find_LPC_FIX.c | 2 +-
silk/float/find_LPC_FLP.c | 2 +-
silk/float/main_FLP.h | 3 +-
silk/float/wrappers_FLP.c | 5 +-
silk/init_decoder.c | 1 +
silk/process_NLSFs.c | 4 +-
silk/structs.h | 1 +
.../test_unit_optimization_LPC_inv_pred_gain.c | 107 +++++++++
silk_headers.mk | 1 +
silk_sources.mk | 1 +
tests/test_unit_optimization.c | 9 +-
19 files changed, 523 insertions(+), 25 deletions(-)
create mode 100644 silk/arm/LPC_inv_pred_gain_arm.h
create mode 100644 silk/arm/LPC_inv_pred_gain_neon_intr.c
create mode 100644 silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
diff --git a/silk/CNG.c b/silk/CNG.c
index 8443ad6..78d500a 100644
--- a/silk/CNG.c
+++ b/silk/CNG.c
@@ -142,7 +142,7 @@ void silk_CNG(
silk_CNG_exc( CNG_sig_Q14 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, length, &psCNG->rand_seed );
/* Convert CNG NLSF to filter representation */
- silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order );
+ silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order, psDec->arch );
/* Generate CNG signal, by synthesis filtering */
silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER * sizeof( opus_int32 ) );
diff --git a/silk/LPC_inv_pred_gain.c b/silk/LPC_inv_pred_gain.c
index 4af89aa..64747ad 100644
--- a/silk/LPC_inv_pred_gain.c
+++ b/silk/LPC_inv_pred_gain.c
@@ -36,9 +36,11 @@ POSSIBILITY OF SUCH DAMAGE.
#define MUL32_FRAC_Q(a32, b32, Q) ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))
-/* Compute inverse of LPC prediction gain, and */
-/* test if LPC coefficients are stable (all poles within unit circle) */
-static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inverse prediction gain in energy domain, Q30 */
+/* Compute inverse of LPC prediction gain, and */
+/* test if LPC coefficients are stable (all poles within unit circle) */
+/* Note that specific platforms' optimizations don't guarantee an identical A_QA buffer. */
+/* Since the intermediate A_QA buffer is never used again in the caller functions, that's fine. */
+static opus_int32 LPC_inverse_pred_gain_QA_c( /* O Returns inverse prediction gain in energy domain, Q30 */
opus_int32 A_QA[ 2 ][ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */
const opus_int order /* I Prediction order */
)
@@ -106,7 +108,7 @@ static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inver
}
/* For input in Q12 domain */
-opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
const opus_int order /* I Prediction order */
)
@@ -127,13 +129,14 @@ opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse predi
if( DC_resp >= 4096 ) {
return 0;
}
- return LPC_inverse_pred_gain_QA( Atmp_QA, order );
+ return LPC_inverse_pred_gain_QA_c( Atmp_QA, order );
+ /* Don't use the Atmp_QA buffer anymore from here, because specific platforms' optimizations don't guarantee identical values. */
}
#ifdef FIXED_POINT
/* For input in Q24 domain */
-opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
const opus_int order /* I Prediction order */
)
@@ -149,6 +152,7 @@ opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse pred
Anew_QA[ k ] = silk_RSHIFT32( A_Q24[ k ], 24 - QA );
}
- return LPC_inverse_pred_gain_QA( Atmp_QA, order );
+ return LPC_inverse_pred_gain_QA_c( Atmp_QA, order );
+ /* Don't use the Atmp_QA buffer anymore from here, because specific platforms' optimizations don't guarantee identical values. */
}
#endif
diff --git a/silk/NLSF2A.c b/silk/NLSF2A.c
index b1c559e..a259212 100644
--- a/silk/NLSF2A.c
+++ b/silk/NLSF2A.c
@@ -66,7 +66,8 @@ static OPUS_INLINE void silk_NLSF2A_find_poly(
void silk_NLSF2A(
opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */
const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */
- const opus_int d /* I filter order (should be even) */
+ const opus_int d, /* I filter order (should be even) */
+ int arch /* I Run-time architecture */
)
{
/* This ordering was found to maximize quality. It improves numerical accuracy of
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index b632994..570ae11 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -47,6 +47,10 @@ extern "C"
#include "x86/SigProc_FIX_sse.h"
#endif
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/LPC_inv_pred_gain_arm.h"
+#endif
+
/********************************************************************/
/* SIGNAL PROCESSING FUNCTIONS */
/********************************************************************/
@@ -132,13 +136,13 @@ void silk_bwexpander_32(
/* Compute inverse of LPC prediction gain, and */
/* test if LPC coefficients are stable (all poles within unit circle) */
-opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
const opus_int order /* I Prediction order */
);
/* For input in Q24 domain */
-opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
const opus_int order /* I Prediction order */
);
@@ -152,6 +156,14 @@ void silk_ana_filt_bank_1(
const opus_int32 N /* I Number of input samples */
);
+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
+#define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),silk_LPC_inverse_pred_gain_c(A_Q12, order))
+#endif
+
+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain_Q24)
+#define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((void)(arch),silk_LPC_inverse_pred_gain_Q24_c(A_Q24, order))
+#endif
+
/********************************************************************/
/* SCALAR FUNCTIONS */
/********************************************************************/
@@ -271,7 +283,8 @@ void silk_A2NLSF(
void silk_NLSF2A(
opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */
const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */
- const opus_int d /* I filter order (should be even) */
+ const opus_int d, /* I filter order (should be even) */
+ int arch /* I Run-time architecture */
);
void silk_insertion_sort_increasing(
diff --git a/silk/arm/LPC_inv_pred_gain_arm.h b/silk/arm/LPC_inv_pred_gain_arm.h
new file mode 100644
index 0000000..77d7167
--- /dev/null
+++ b/silk/arm/LPC_inv_pred_gain_arm.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(LPC_INV_PRED_GAIN_ARM_H)
+# define LPC_INV_PRED_GAIN_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD)
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),PRESUME_NEON(silk_LPC_inverse_pred_gain)(A_Q12, order))
+# endif
+
+# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK+1])(const opus_int16 *A_Q12, const opus_int order);
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order) ((*SILK_LPC_INVERSE_PRED_GAIN_IMPL[(arch)&OPUS_ARCHMASK])(A_Q12, order))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),silk_LPC_inverse_pred_gain_neon(A_Q12, order))
+# endif
+# endif
+
+# if defined(FIXED_POINT)
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
+opus_int32 silk_LPC_inverse_pred_gain_Q24_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
+ const opus_int order /* I Prediction order */
+);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD)
+# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1)
+# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((void)(arch),PRESUME_NEON(silk_LPC_inverse_pred_gain_Q24)(A_Q24, order))
+# endif
+
+# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain_Q24)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[OPUS_ARCHMASK+1])(const opus_int32 *A_Q24, const opus_int order);
+# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1)
+# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((*SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[(arch)&OPUS_ARCHMASK])(A_Q24, order))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1)
+# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((void)(arch),silk_LPC_inverse_pred_gain_Q24_neon(A_Q24, order))
+# endif
+# endif
+
+# endif /* end FIXED_POINT */
+
+#endif /* end LPC_INV_PRED_GAIN_ARM_H */
diff --git a/silk/arm/LPC_inv_pred_gain_neon_intr.c b/silk/arm/LPC_inv_pred_gain_neon_intr.c
new file mode 100644
index 0000000..29f0e57
--- /dev/null
+++ b/silk/arm/LPC_inv_pred_gain_neon_intr.c
@@ -0,0 +1,258 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+ @file LPC_inv_pred_gain_neon_intr.c
+ @brief ARM Neon Intrinsic optimizations for silk silk_LPC_inverse_pred_gain functions
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include <arm_neon.h>
+#include "stack_alloc.h"
+#include "main_FIX.h"
+
+#define QA 24
+#define A_LIMIT SILK_FIX_CONST( 0.99975, QA )
+
+/* Compute inverse of LPC prediction gain, and */
+/* test if LPC coefficients are stable (all poles within unit circle) */
+/* Note that this NEON optimization doesn't guarantee an identical A_QA buffer. */
+/* Since the intermediate A_QA buffer is never used again in the caller functions, that's fine. */
+static opus_int32 LPC_inverse_pred_gain_QA_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ opus_int32 A_QA[ 2 ][ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */
+ const opus_int order /* I Prediction order */
+)
+{
+ opus_int k, n, mult2Q;
+ opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2;
+ opus_int32 *Aold_QA, *Anew_QA;
+
+ Anew_QA = A_QA[ order & 1 ];
+
+ invGain_Q30 = (opus_int32)1 << 30;
+ for( k = order - 1; k > 0; k-- ) {
+ int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;
+ int64x2_t mult2Q_s64x2;
+
+ /* Check for stability */
+ if( ( Anew_QA[ k ] > A_LIMIT ) || ( Anew_QA[ k ] < -A_LIMIT ) ) {
+ return 0;
+ }
+
+ /* Set RC equal to negated AR coef */
+ rc_Q31 = -silk_LSHIFT( Anew_QA[ k ], 31 - QA );
+
+ /* rc_mult1_Q30 range: [ 1 : 2^30 ] */
+ rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
+ silk_assert( rc_mult1_Q30 > ( 1 << 15 ) ); /* reduce A_LIMIT if fails */
+ silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );
+
+ /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */
+ mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );
+ rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );
+
+ /* Update inverse gain */
+ /* invGain_Q30 range: [ 0 : 2^30 ] */
+ invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
+ silk_assert( invGain_Q30 >= 0 );
+ silk_assert( invGain_Q30 <= ( 1 << 30 ) );
+
+ /* Swap pointers */
+ Aold_QA = Anew_QA;
+ Anew_QA = A_QA[ k & 1 ];
+
+ /* Update AR coefficient */
+ rc_Q31_s32x2 = vdup_n_s32(rc_Q31);
+ mult2Q_s64x2 = vdupq_n_s64(-mult2Q);
+ rc_mult2_s32x2 = vdup_n_s32(rc_mult2);
+
+ for( n = 0; n < k; n += 4 ) {
+ /* We always calculate extra elements of the A_QA buffer when (k % 4) != 0, to take advantage of SIMD parallelization. */
+ int32x4_t Aold_QA_s32x4, Aold_QAr_s32x4, t_s32x4, tmp_QA_s32x4;
+ int64x2_t tmp0_s64x2, tmp1_s64x2;
+ Aold_QA_s32x4 = vld1q_s32(Aold_QA + n);
+ Aold_QAr_s32x4 = vld1q_s32(Aold_QA + k - n - 4);
+ Aold_QAr_s32x4 = vrev64q_s32(Aold_QAr_s32x4);
+ Aold_QAr_s32x4 = vcombine_s32(vget_high_s32(Aold_QAr_s32x4), vget_low_s32(Aold_QAr_s32x4)); // Compiler should generate VSWP.
+ t_s32x4 = vqrdmulhq_lane_s32(Aold_QAr_s32x4, rc_Q31_s32x2, 0);
+ tmp_QA_s32x4 = vsubq_s32(Aold_QA_s32x4, t_s32x4);
+ tmp0_s64x2 = vmull_s32(vget_low_s32 (tmp_QA_s32x4), rc_mult2_s32x2);
+ tmp1_s64x2 = vmull_s32(vget_high_s32(tmp_QA_s32x4), rc_mult2_s32x2);
+ tmp0_s64x2 = vrshlq_s64(tmp0_s64x2, mult2Q_s64x2);
+ tmp1_s64x2 = vrshlq_s64(tmp1_s64x2, mult2Q_s64x2);
+ t_s32x4 = vcombine_s32(vmovn_s64(tmp0_s64x2), vmovn_s64(tmp1_s64x2));
+ vst1q_s32(Anew_QA + n, t_s32x4);
+ }
+ }
+
+ /* Check for stability */
+ if( ( Anew_QA[ 0 ] > A_LIMIT ) || ( Anew_QA[ 0 ] < -A_LIMIT ) ) {
+ return 0;
+ }
+
+ /* Set RC equal to negated AR coef */
+ rc_Q31 = -silk_LSHIFT( Anew_QA[ 0 ], 31 - QA );
+
+ /* Range: [ 1 : 2^30 ] */
+ rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
+
+ /* Update inverse gain */
+ /* Range: [ 0 : 2^30 ] */
+ invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
+ silk_assert( invGain_Q30 >= 0 );
+ silk_assert( invGain_Q30 <= 1<<30 );
+
+ return invGain_Q30;
+}
+
+/* For input in Q12 domain */
+opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+)
+{
+ opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ];
+ opus_int32 DC_resp = 0;
+ int16x8_t t0_s16x8, t1_s16x8;
+ int32x4_t t0_s32x4;
+
+ /* Increase Q domain of the AR coefficients */
+ silk_assert(!(order & 1)); // order is even
+ silk_assert(SILK_MAX_ORDER_LPC <= 16);
+ t0_s16x8 = vld1q_s16(A_Q12);
+ t1_s16x8 = vld1q_s16(A_Q12 + 8);
+ t0_s32x4 = vpaddlq_s16(t0_s16x8);
+ switch( order )
+ {
+ case 16:
+ {
+ int32x2_t t_s32x2;
+ int64x1_t t_s64x1;
+ t0_s32x4 = vpadalq_s16(t0_s32x4, t1_s16x8);
+ t_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+ t_s64x1 = vpaddl_s32(t_s32x2);
+ DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+ }
+ break;
+
+ case 14:
+ {
+ int64x1_t t_s64x1;
+ int32x4_t t1_s32x4 = vpaddlq_s16(t1_s16x8);
+ int32x2_t t_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+ t_s32x2 = vadd_s32(t_s32x2, vget_low_s32(t1_s32x4));
+ t_s64x1 = vpaddl_s32(t_s32x2);
+ t_s64x1 = vreinterpret_s64_s32(vadd_s32(vreinterpret_s32_s64(t_s64x1), vget_high_s32(t1_s32x4)));
+ DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+ }
+ break;
+
+ case 12:
+ {
+ int64x1_t t_s64x1;
+ int32x2_t t0_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+ int32x2_t t1_s32x2 = vpaddl_s16(vget_low_s16(t1_s16x8));
+ t0_s32x2 = vadd_s32(t0_s32x2, t1_s32x2);
+ t_s64x1 = vpaddl_s32(t0_s32x2);
+ DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+ }
+ break;
+
+ case 10:
+ {
+ int32x2_t t0_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+ int32x2_t t1_s32x2 = vpaddl_s16(vget_low_s16(t1_s16x8));
+ int64x1_t t_s64x1 = vpaddl_s32(t0_s32x2);
+ t_s64x1 = vreinterpret_s64_s32(vadd_s32(vreinterpret_s32_s64(t_s64x1), t1_s32x2));
+ DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+ }
+ break;
+
+ case 8:
+ DC_resp += (opus_int32)A_Q12[ 7 ];
+ DC_resp += (opus_int32)A_Q12[ 6 ];
+
+ case 6:
+ DC_resp += (opus_int32)A_Q12[ 5 ];
+ DC_resp += (opus_int32)A_Q12[ 4 ];
+
+ case 4:
+ DC_resp += (opus_int32)A_Q12[ 3 ];
+ DC_resp += (opus_int32)A_Q12[ 2 ];
+
+ case 2:
+ DC_resp += (opus_int32)A_Q12[ 1 ];
+ DC_resp += (opus_int32)A_Q12[ 0 ];
+
+ default:
+ break;
+ }
+
+ /* If the DC is unstable, we don't even need to do the full calculations */
+ if( DC_resp >= 4096 ) {
+ return 0;
+ }
+ vst1q_s32(Atmp_QA[ 0 ], vshll_n_s16(vget_low_s16 (t0_s16x8), QA - 12));
+ vst1q_s32(Atmp_QA[ 0 ] + 4, vshll_n_s16(vget_high_s16(t0_s16x8), QA - 12));
+ vst1q_s32(Atmp_QA[ 0 ] + 8, vshll_n_s16(vget_low_s16 (t1_s16x8), QA - 12));
+ vst1q_s32(Atmp_QA[ 0 ] + 12, vshll_n_s16(vget_high_s16(t1_s16x8), QA - 12));
+
+ return LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
+}
+
+#ifdef FIXED_POINT
+
+/* For input in Q24 domain */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
+ const opus_int order /* I Prediction order */
+)
+{
+ opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ];
+
+ /* Increase Q domain of the AR coefficients */
+ silk_assert(!(order & 1)); // order is even
+ silk_assert(SILK_MAX_ORDER_LPC == 16);
+ silk_assert(QA == 24); // No shift.
+ vst1q_s32(Atmp_QA[ 0 ], vld1q_s32(A_Q24));
+ vst1q_s32(Atmp_QA[ 0 ] + 4, vld1q_s32(A_Q24 + 4));
+ vst1q_s32(Atmp_QA[ 0 ] + 8, vld1q_s32(A_Q24 + 8));
+ vst1q_s32(Atmp_QA[ 0 ] + 12, vld1q_s32(A_Q24 + 12));
+
+ return LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
+}
+
+#endif
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 2e330c4..59ceb6e 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -30,11 +30,21 @@ POSSIBILITY OF SUCH DAMAGE.
#include "main_FIX.h"
#include "NSQ.h"
+#include "SigProc_FIX.h"
#if defined(OPUS_HAVE_RTCD)
-# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \
- !defined(OPUS_ARM_PRESUME_NEON_INTR))
+# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+
+opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+) = {
+ silk_LPC_inverse_pred_gain_c, /* ARMv4 */
+ silk_LPC_inverse_pred_gain_c, /* EDSP */
+ silk_LPC_inverse_pred_gain_c, /* Media */
+ MAY_HAVE_NEON(silk_LPC_inverse_pred_gain), /* Neon */
+};
/*There is no table for silk_noise_shape_quantizer_short_prediction because the
NEON version takes different parameters than the C version.
@@ -56,6 +66,16 @@ opus_int32
#if defined(FIXED_POINT) && \
defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
+ const opus_int order /* I Prediction order */
+) = {
+ silk_LPC_inverse_pred_gain_Q24_c, /* ARMv4 */
+ silk_LPC_inverse_pred_gain_Q24_c, /* EDSP */
+ silk_LPC_inverse_pred_gain_Q24_c, /* Media */
+ MAY_HAVE_NEON(silk_LPC_inverse_pred_gain_Q24), /* Neon */
+};
+
void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
opus_int32 *corr, /* O Result [order + 1] */
opus_int *scale, /* O Scaling of the correlation vector */
diff --git a/silk/decode_parameters.c b/silk/decode_parameters.c
index e345b1d..a56a409 100644
--- a/silk/decode_parameters.c
+++ b/silk/decode_parameters.c
@@ -52,7 +52,7 @@ void silk_decode_parameters(
silk_NLSF_decode( pNLSF_Q15, psDec->indices.NLSFIndices, psDec->psNLSF_CB );
/* Convert NLSF parameters to AR prediction filter coefficients */
- silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order );
+ silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order, psDec->arch );
/* If just reset, e.g., because internal Fs changed, do not allow interpolation */
/* improves the case of packet loss in the first frame after a switch */
@@ -69,7 +69,7 @@ void silk_decode_parameters(
}
/* Convert NLSF parameters to AR prediction filter coefficients */
- silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order );
+ silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order, psDec->arch );
} else {
/* Copy LPC coefficients for first half from second half */
silk_memcpy( psDecCtrl->PredCoef_Q12[ 0 ], psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) );
diff --git a/silk/fixed/find_LPC_FIX.c b/silk/fixed/find_LPC_FIX.c
index e11cdc8..e55b63a 100644
--- a/silk/fixed/find_LPC_FIX.c
+++ b/silk/fixed/find_LPC_FIX.c
@@ -92,7 +92,7 @@ void silk_find_LPC_FIX(
silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder );
/* Convert to LPC for residual energy evaluation */
- silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch );
/* Calculate residual energy with NLSF interpolation */
silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch );
diff --git a/silk/float/find_LPC_FLP.c b/silk/float/find_LPC_FLP.c
index fcfe1c3..4d63964 100644
--- a/silk/float/find_LPC_FLP.c
+++ b/silk/float/find_LPC_FLP.c
@@ -73,7 +73,7 @@ void silk_find_LPC_FLP(
silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder );
/* Convert to LPC for residual energy evaluation */
- silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch );
/* Calculate residual energy with LSF interpolation */
silk_LPC_analysis_filter_FLP( LPC_res, a_tmp, x, 2 * subfr_length, psEncC->predictLPCOrder );
diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h
index e5a7597..c2105a5 100644
--- a/silk/float/main_FLP.h
+++ b/silk/float/main_FLP.h
@@ -285,7 +285,8 @@ void silk_A2NLSF_FLP(
void silk_NLSF2A_FLP(
silk_float *pAR, /* O LPC coefficients [ LPC_order ] */
const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */
- const opus_int LPC_order /* I LPC order */
+ const opus_int LPC_order, /* I LPC order */
+ int arch /* I Run-time architecture */
);
/* Limit, stabilize, and quantize NLSFs */
diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c
index 6666b8e..53a556e 100644
--- a/silk/float/wrappers_FLP.c
+++ b/silk/float/wrappers_FLP.c
@@ -54,13 +54,14 @@ void silk_A2NLSF_FLP(
void silk_NLSF2A_FLP(
silk_float *pAR, /* O LPC coefficients [ LPC_order ] */
const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */
- const opus_int LPC_order /* I LPC order */
+ const opus_int LPC_order, /* I LPC order */
+ int arch /* I Run-time architecture */
)
{
opus_int i;
opus_int16 a_fix_Q12[ MAX_LPC_ORDER ];
- silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order );
+ silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order, arch );
for( i = 0; i < LPC_order; i++ ) {
pAR[ i ] = ( silk_float )a_fix_Q12[ i ] * ( 1.0f / 4096.0f );
diff --git a/silk/init_decoder.c b/silk/init_decoder.c
index f887c67..16c03dc 100644
--- a/silk/init_decoder.c
+++ b/silk/init_decoder.c
@@ -44,6 +44,7 @@ opus_int silk_init_decoder(
/* Used to deactivate LSF interpolation */
psDec->first_frame_after_reset = 1;
psDec->prev_gain_Q16 = 65536;
+ psDec->arch = opus_select_arch();
/* Reset CNG state */
silk_CNG_Reset( psDec );
diff --git a/silk/process_NLSFs.c b/silk/process_NLSFs.c
index 0ab71f0..2f10f8d 100644
--- a/silk/process_NLSFs.c
+++ b/silk/process_NLSFs.c
@@ -89,7 +89,7 @@ void silk_process_NLSFs(
NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType );
/* Convert quantized NLSFs back to LPC coefficients */
- silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder, psEncC->arch );
if( doInterpolate ) {
/* Calculate the interpolated, quantized LSF vector for the first half */
@@ -97,7 +97,7 @@ void silk_process_NLSFs(
psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );
/* Convert back to LPC coefficients */
- silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder, psEncC->arch );
} else {
/* Copy LPC coefficients for first half from second half */
diff --git a/silk/structs.h b/silk/structs.h
index 827829d..b68e4c9 100644
--- a/silk/structs.h
+++ b/silk/structs.h
@@ -301,6 +301,7 @@ typedef struct {
/* Stuff used for PLC */
opus_int lossCnt;
opus_int prevSignalType;
+ int arch;
silk_PLC_struct sPLC;
diff --git a/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c b/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
new file mode 100644
index 0000000..e98f3f6
--- /dev/null
+++ b/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "SigProc_FIX.h"
+
+static int test_silk_LPC_inverse_pred_gain(int arch)
+{
+ unsigned int i;
+ opus_int order;
+ opus_int16 A_Q12[ SILK_MAX_ORDER_LPC ];
+ opus_int32 rtn_org, rtn_opt;
+ (void)arch;
+
+ printf("%50s", "LPC_inverse_pred_gain() ...");
+ for( order = 2; order <= SILK_MAX_ORDER_LPC; order += 2 ) // order must be even.
+ {
+ for (unsigned int shift = 0; shift < 16; shift++) // Test dynamic range.
+ {
+ for (i = 0; i < SILK_MAX_ORDER_LPC; i++)
+ {
+ A_Q12[i] = ((opus_int16)rand()) >> shift;
+ }
+
+ rtn_org = silk_LPC_inverse_pred_gain_c(A_Q12, order);
+ rtn_opt = silk_LPC_inverse_pred_gain (A_Q12, order);
+ if ((rtn_org != rtn_opt))
+ {
+ printf("order=%2d failed!\n", order);
+ printf("rtn_org=%d rtn_opt=%d!\n", rtn_org, rtn_opt);
+ return -1;
+ }
+ }
+ }
+ printf(" passed!\n");
+ return 0;
+}
+
+#ifdef FIXED_POINT
+
+static int test_silk_LPC_inverse_pred_gain_Q24(int arch)
+{
+ unsigned int i;
+ opus_int order;
+ opus_int32 A_Q24[ SILK_MAX_ORDER_LPC ];
+ opus_int32 rtn_org, rtn_opt;
+ (void)arch;
+
+ printf("%50s", "LPC_inverse_pred_gain_Q24() ...");
+ for( order = 2; order <= SILK_MAX_ORDER_LPC; order += 2 ) // order must be even.
+ {
+ for (unsigned int shift = 0; shift < 31; shift++) // Test dynamic range.
+ {
+ for (i = 0; i < SILK_MAX_ORDER_LPC; i++)
+ {
+ A_Q24[i] = ((opus_int32)rand()) >> shift;
+ }
+
+ rtn_org = silk_LPC_inverse_pred_gain_Q24_c(A_Q24, order);
+ rtn_opt = silk_LPC_inverse_pred_gain_Q24 (A_Q24, order);
+ if ((rtn_org != rtn_opt))
+ {
+ printf("order=%2d failed!\n", order);
+ printf("rtn_org=%d rtn_opt=%d!\n", rtn_org, rtn_opt);
+ return -1;
+ }
+ }
+ }
+ printf(" passed!\n");
+ return 0;
+}
+
+#endif /* FIXED_POINT */
diff --git a/silk_headers.mk b/silk_headers.mk
index 52c42d0..ca9bf27 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -22,6 +22,7 @@ silk/resampler_rom.h \
silk/resampler_structs.h \
silk/SigProc_FIX.h \
silk/x86/SigProc_FIX_sse.h \
+silk/arm/LPC_inv_pred_gain_arm.h \
silk/arm/macros_armv4.h \
silk/arm/macros_armv5e.h \
silk/arm/macros_arm64.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 5f9551b..d8323df 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -84,6 +84,7 @@ silk/x86/VQ_WMat_EC_sse.c
SILK_SOURCES_ARM_NEON_INTR = \
silk/arm/arm_silk_map.c \
+silk/arm/LPC_inv_pred_gain_neon_intr.c \
silk/arm/NSQ_neon.c
SILK_SOURCES_FIXED = \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index b5c25d9..8e90074 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -29,6 +29,7 @@
#endif
#include <stdio.h>
+#include "cpu_support.h"
#include "stack_alloc.h"
#define SKIP_CONFIG_H
@@ -44,21 +45,25 @@
#endif
+# include "silk/tests/test_unit_optimization_LPC_inv_pred_gain.c"
+
int main(void)
{
int result = 0; /* 0: passed; other: failed */
ALLOC_STACK;
-#ifdef FIXED_POINT
int arch = opus_select_arch();
-#endif /* FIXED_POINT */
int count = 10;
+ srand(0);
+
while (!result && count--) {
printf("\n--------------------------- Testing optimization ---------------------------\n");
#ifdef FIXED_POINT
result |= test_fir(arch);
+ result |= test_silk_LPC_inverse_pred_gain_Q24(arch);
result |= test_warped_autocorrelation(arch);
#endif /* FIXED_POINT */
+ result |= test_silk_LPC_inverse_pred_gain(arch);
}
return result;
}
--
2.8.0.rc3.226.g39d4020