[opus] [PATCH] Optimize silk_warped_autocorrelation_FIX() for ARM NEON

Linfeng Zhang linfengz at google.com
Fri Jul 1 16:25:57 UTC 2016


Create silk_warped_autocorrelation_FIX_c_opt(), which unrolls the input loop
and processes 8 input samples in parallel. It has a very long prolog and
epilog, but that is the price of good speed on this heavily hit function.
This function can serve as the code base for optimizations on other CPUs.
Create the ARM NEON intrinsics optimization silk_warped_autocorrelation_FIX_neon().
Create tests/test_unit_optimization to unit test the optimizations.
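
For reference, the per-sample kernel that both new implementations unroll is
the allpass-section recursion below (this is the scalar form kept as the tail
loop of the new code; QS and QC are the fixed-point scalings defined in
silk/fixed/main_FIX.h):

    for( n = 0; n < length; n++ ) {
        input_QS = tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS );
        /* Loop over allpass sections */
        for( i = 0; i <= order; i++ ) {
            /* Output of allpass section */
            tmp2_QS               = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS, warping_Q16 );
            state_QS[ order - i ] = tmp1_QS;
            corr_QC[ order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS, input_QS ), 2 * QS - QC );
            tmp1_QS               = tmp2_QS;
        }
    }

Unrolling by 8 keeps eight of these recursions in flight at once, staggered
one allpass section apart; the long prolog fills that pipeline and the long
epilog drains it.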
---
 .gitignore                                         |   2 +-
 Makefile.am                                        |  44 +-
 celt/tests/test_unit_optimization_lpc.c            |  56 +--
 silk/arm/arm_silk_map.c                            |  20 +
 silk/arm/warped_autocorrelation_FIX_arm.h          |  65 +++
 silk/arm/warped_autocorrelation_FIX_neon_intr.c    | 495 +++++++++++++++++++++
 silk/fixed/main_FIX.h                              |  15 +-
 .../fixed/mips/warped_autocorrelation_FIX_mipsr1.h |   6 -
 silk/fixed/warped_autocorrelation_FIX.c            |   7 +-
 ...test_unit_optimization_warped_autocorrelation.c | 441 ++++++++++++++++++
 silk_headers.mk                                    |   1 +
 silk_sources.mk                                    |   3 +-
 tests/test_unit_optimization.c                     |  64 +++
 13 files changed, 1153 insertions(+), 66 deletions(-)
 create mode 100644 silk/arm/warped_autocorrelation_FIX_arm.h
 create mode 100644 silk/arm/warped_autocorrelation_FIX_neon_intr.c
 create mode 100644 silk/tests/test_unit_optimization_warped_autocorrelation.c
 create mode 100644 tests/test_unit_optimization.c

diff --git a/.gitignore b/.gitignore
index 9e824af..05d0582 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,6 +49,7 @@ tests/test_opus_api
 tests/test_opus_decode
 tests/test_opus_encode
 tests/test_opus_padding
+tests/test_unit_optimization
 celt/arm/armopts.s
 celt/dump_modes/dump_modes
 celt/tests/test_unit_cwrs32
@@ -57,7 +58,6 @@ celt/tests/test_unit_entropy
 celt/tests/test_unit_laplace
 celt/tests/test_unit_mathops
 celt/tests/test_unit_mdct
-celt/tests/test_unit_optimization_lpc
 celt/tests/test_unit_rotation
 celt/tests/test_unit_types
 doc/doxygen_sqlite3.db
diff --git a/Makefile.am b/Makefile.am
index 08d26c5..2bfb923 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -84,9 +84,36 @@ pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_type
 noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
 
 if EXTRA_PROGRAMS
-noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_types
-
-TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding
+noinst_PROGRAMS = opus_demo \
+                  repacketizer_demo \
+                  opus_compare \
+                  celt/tests/test_unit_cwrs32 \
+                  celt/tests/test_unit_dft \
+                  celt/tests/test_unit_entropy \
+                  celt/tests/test_unit_laplace \
+                  celt/tests/test_unit_mathops \
+                  celt/tests/test_unit_mdct \
+                  celt/tests/test_unit_rotation \
+                  celt/tests/test_unit_types \
+                  tests/test_opus_api \
+                  tests/test_opus_encode \
+                  tests/test_opus_decode \
+                  tests/test_opus_padding \
+                  tests/test_unit_optimization
+
+TESTS = celt/tests/test_unit_types \
+        celt/tests/test_unit_mathops \
+        celt/tests/test_unit_entropy \
+        celt/tests/test_unit_laplace \
+        celt/tests/test_unit_dft \
+        celt/tests/test_unit_mdct \
+        celt/tests/test_unit_rotation \
+        celt/tests/test_unit_cwrs32 \
+        tests/test_opus_api \
+        tests/test_opus_decode \
+        tests/test_opus_encode \
+        tests/test_opus_padding \
+        tests/test_unit_optimization
 
 opus_demo_SOURCES = src/opus_demo.c
 
@@ -111,6 +138,9 @@ tests_test_opus_decode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 tests_test_opus_padding_SOURCES = tests/test_opus_padding.c tests/test_opus_common.h
 tests_test_opus_padding_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
 
+tests_test_unit_optimization_SOURCES = tests/test_unit_optimization.c
+tests_test_unit_optimization_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
+
 celt_tests_test_unit_cwrs32_SOURCES = celt/tests/test_unit_cwrs32.c
 celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
 
@@ -138,12 +168,6 @@ if OPUS_ARM_EXTERNAL_ASM
 celt_tests_test_unit_mdct_LDADD += libarmasm.la
 endif
 
-celt_tests_test_unit_optimization_lpc_SOURCES = celt/tests/test_unit_optimization_lpc.c
-celt_tests_test_unit_optimization_lpc_LDADD = $(NE10_LIBS) $(LIBM)
-if OPUS_ARM_EXTERNAL_ASM
-celt_tests_test_unit_optimization_lpc_LDADD += libarmasm.la
-endif
-
 celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
 celt_tests_test_unit_rotation_LDADD = $(NE10_LIBS) $(LIBM)
 if OPUS_ARM_EXTERNAL_ASM
@@ -283,7 +307,7 @@ OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
                     $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \
                     $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \
                     $(celt_tests_test_unit_dft_SOURCES:.c=.o) \
-                    $(celt_tests_test_unit_optimization_lpc_SOURCES:.c=.o)
+                    $(tests_test_unit_optimization_SOURCES:.c=.o)
 
 if HAVE_SSE
 SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
diff --git a/celt/tests/test_unit_optimization_lpc.c b/celt/tests/test_unit_optimization_lpc.c
index 903c838..7247046 100644
--- a/celt/tests/test_unit_optimization_lpc.c
+++ b/celt/tests/test_unit_optimization_lpc.c
@@ -25,56 +25,39 @@
 */
 
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+# include "config.h"
 #endif
 
 #define SKIP_CONFIG_H
 
 #ifndef CUSTOM_MODES
-#define CUSTOM_MODES
+# define CUSTOM_MODES
 #endif
 
 #include <stdio.h>
+#include <string.h>
 
-#define CELT_C
-#include "stack_alloc.h"
-#include "mathops.c"
-#include "entcode.c"
+#ifndef CELT_C
+# define CELT_C
+#endif
+#include "celt_lpc.h"
+#include "modes.h"
 
 #ifdef FIXED_POINT
 
-#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
-# include "celt_lpc.c"
-# include "pitch.c"
-# include "x86/x86cpu.c"
-# include "x86/celt_lpc_sse.c"
-# include "x86/pitch_sse2.c"
-# include "x86/pitch_sse4_1.c"
-# include "x86/x86_celt_map.c"
-#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
-# include "celt_lpc.c"
-# include "pitch.c"
-# include "arm/armcpu.c"
-# include "arm/arm_celt_map.c"
-# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
-#  include "arm/celt_lpc_neon_intr.c"
-#  include "arm/celt_neon_intr.c"
-# endif
-#endif
-
 #define MAX_ORDER 32
 
-void test_fir(int arch)
+static int test_fir(int arch)
 {
    opus_val16 x[MAX_PERIOD+MAX_ORDER];
    opus_val16 num[MAX_ORDER];
    opus_val16 yorg[MAX_PERIOD], yopt[MAX_PERIOD];
    int N, ord;
-
    unsigned int i;
+
+   printf("%50s", "celt_fir() ...");
    for(ord=0;ord<=MAX_ORDER;ord++)
    {
-      printf("ord=%2d", ord);
       for(N=ord;N<=MAX_PERIOD;N++) /* N is larger than or equal to ord. */
       {
          for (i=0;i<MAX_PERIOD+MAX_ORDER;++i)
@@ -95,7 +78,7 @@ void test_fir(int arch)
          celt_fir  (x+MAX_ORDER, num, yopt, N, ord, arch);
          if (memcmp(yorg, yopt, sizeof(yorg)))
          {
-            printf(" N=%3d failed!\nError in lpc unit test!!!\n", N);
+            printf("ord=%2d N=%3d failed!\nError in lpc unit test!!!\n", ord, N);
             for (i=0;i<sizeof(yorg) / sizeof(*yorg);i++)
             {
                if (yorg[i] != yopt[i])
@@ -103,20 +86,11 @@ void test_fir(int arch)
                   printf("yorg[%3d]=%d, yopt[%3d]=%d\n", i, yorg[i], i, yopt[i]);
                }
             }
-            exit(0);
+            return -1;
          }
       }
-      printf(" passed!\n");
    }
-}
-#endif /* FIXED_POINT */
-
-int main(void)
-{
-#ifdef FIXED_POINT
-   ALLOC_STACK;
-   int arch = opus_select_arch();
-   test_fir(arch);
-#endif /* FIXED_POINT */
+   printf(" passed!\n");
    return 0;
 }
+#endif /* FIXED_POINT */
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 9bd86a7..49fd672 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -28,6 +28,7 @@ POSSIBILITY OF SUCH DAMAGE.
 # include "config.h"
 #endif
 
+#include "main_FIX.h"
 #include "NSQ.h"
 
 #if defined(OPUS_HAVE_RTCD)
@@ -52,4 +53,23 @@ opus_int32
 
 # endif
 
+#if defined(FIXED_POINT) && \
+ defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+
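+/* RTCD table, indexed by (arch & OPUS_ARCHMASK): the ARMv4, EDSP and Media
+   entries fall back to the C implementation; only the NEON entry dispatches
+   to the new intrinsics code. */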
+void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
+          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+    const opus_int                  length,                                 /* I    Length of input                                                             */
+    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+) = {
+      silk_warped_autocorrelation_FIX_c,    /* ARMv4 */
+      silk_warped_autocorrelation_FIX_c,    /* EDSP */
+      silk_warped_autocorrelation_FIX_c,    /* Media */
+      silk_warped_autocorrelation_FIX_neon, /* Neon */
+};
+
+#endif
+
 #endif /* OPUS_HAVE_RTCD */
diff --git a/silk/arm/warped_autocorrelation_FIX_arm.h b/silk/arm/warped_autocorrelation_FIX_arm.h
new file mode 100644
index 0000000..ee892bf
--- /dev/null
+++ b/silk/arm/warped_autocorrelation_FIX_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(WARPED_AUTOCORRELATION_FIX_ARM_H)
+# define WARPED_AUTOCORRELATION_FIX_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if defined(FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+void silk_warped_autocorrelation_FIX_neon(
+          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+    const opus_int                  length,                                 /* I    Length of input                                                             */
+    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+);
+#  endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_silk_warped_autocorrelation_FIX (1)
+#   define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+  ((void)(arch),PRESUME_NEON(silk_warped_autocorrelation_FIX)(corr, scale, input, warping_Q16, length, order))
+#  endif
+
+#  if !defined(OVERRIDE_silk_warped_autocorrelation_FIX)
+/* Is run-time CPU detection enabled on this platform? */
+#   if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+   || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+   && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
+extern void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK+1])(opus_int32*, opus_int*, const opus_int16*, const opus_int, const opus_int, const opus_int);
+
+#    define OVERRIDE_silk_warped_autocorrelation_FIX (1)
+#    define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+  ((*SILK_WARPED_AUTOCORRELATION_FIX_IMPL[(arch)&OPUS_ARCHMASK])(corr, scale, input, warping_Q16, length, order))
+#   endif
+#  endif
+# endif /* end FIXED_POINT */
+
+#endif /* end WARPED_AUTOCORRELATION_FIX_ARM_H */
diff --git a/silk/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/arm/warped_autocorrelation_FIX_neon_intr.c
new file mode 100644
index 0000000..80dd949
--- /dev/null
+++ b/silk/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -0,0 +1,495 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file warped_autocorrelation_FIX_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for silk silk_warped_autocorrelation_FIX functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+
+#include <arm_neon.h>
+#include "stack_alloc.h"
+#include "main_FIX.h"
+
+#ifdef FIXED_POINT
+
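+/* Process 8 input samples per iteration of the outer loop; after the widening
+   shift to QS they occupy two int32x4 vectors. */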
+#define NUM_PARALLEL_INPUTS 8
+
+void silk_warped_autocorrelation_FIX_neon(
+         opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+         opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+   const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+   const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+   const opus_int                  length,                                 /* I    Length of input                                                             */
+   const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+)
+{
+   opus_int   n = 0, i, lsh;
+   opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 input_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; /* Create two extra entries. */
+   opus_int32 *state_QS = state_QS_tmp + 1; /* Accessed one extra head entry in the last prolog and the last inner loop, and one extra end entry in the last prolog. */
+   opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+
+   /* Order must be even */
+   silk_assert( ( order & 1 ) == 0 );
+   silk_assert( 2 * QS - QC >= 0 );
+
+   /* Loop over samples */
+   if( order >= NUM_PARALLEL_INPUTS - 2 ) {
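+      /* The vector path requires order >= NUM_PARALLEL_INPUTS - 2 so that the prolog/epilog state accesses
+         (including the intentional one-entry over-reads noted below) stay inside the padded state_QS_tmp
+         buffer; smaller orders take the scalar loop at the end of the function. */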
+      const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16);
+      for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) {
+         int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4, tmp2_QS1_s32x4;
+         int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2, corr_QC3_s64x2;
+         int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
+         int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2;
+         int64x1_t corr_QC_s64x1;
+         const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n),     QS);
+         const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4), QS);
+         vst1q_s32(tmp1_QS,      input_QS0_s32x4);
+         vst1q_s32(tmp1_QS  + 4, input_QS1_s32x4);
+
+         /* Loop over allpass sections */
+
+         /* -------------------- prolog 0 -------------------- */
+
+         tmp1_QS_s32x2  = vget_low_s32(input_QS0_s32x4);
+         tmp2_QS_s32x2  = vld1_s32(state_QS + order); /* Accessed one extra end entry. */
+         vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
+         corr_QC_s64x1  = vld1_s64(corr_QC + order);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2));
+         vst1_s64(corr_QC + order, corr_QC_s64x1);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 1), tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
+
+         /* -------------------- prolog 1 -------------------- */
+
+         tmp2_QS_s32x2  = vld1_s32(state_QS + order - 1);
+         vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 2), tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
+
+         /* -------------------- prolog 2 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); /* Accessed one extra end entry. */
+         vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4);  /* Saving one extra entry is OK. */
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t1_s64x2));
+         vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS0_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3);
+
+         /* -------------------- prolog 3 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3);
+         vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4), tmp1_QS0_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vget_low_s32(input_QS1_s32x4);
+
+         /* -------------------- prolog 4 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4);
+         tmp2_QS_s32x2  = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
+         vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4);
+         vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32 (input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t2_s64x2));
+         vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5), tmp1_QS0_s32x4);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 1), tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
+
+         /* -------------------- prolog 5 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5);
+         tmp2_QS_s32x2  = vld1_s32 (state_QS + order - 1);
+         vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4);
+         vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32 (input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6), tmp1_QS0_s32x4);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 2), tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
+
+         /* -------------------- prolog 6 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); /* Accessed one extra end entry. */
+         vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4);
+         vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4);  /* Saving one extra entry is OK. */
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t3_s64x2));
+         vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7), tmp1_QS0_s32x4); /* Accessed one extra head entry when order is 6. */
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS1_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3);
+
+         /* -------------------- kernel loop -------------------- */
+
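+         /* Steady state: each iteration pushes all 8 in-flight samples through one allpass section:
+            store the current outputs to state_QS, accumulate corr_QC, then compute the next
+            section's outputs as tmp1 = tmp2 + ((state - tmp1) * warping_Q16 >> 16). */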
+         for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
+            /* Output of allpass section */
+            tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1);
+            tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5);
+            vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1, tmp1_QS0_s32x4);
+            vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5, tmp1_QS1_s32x4);
+            corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1);
+            corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3);
+            corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5);
+            corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7);
+            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+            t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+            t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+            corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+            corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+            corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1, corr_QC0_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3, corr_QC1_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5, corr_QC2_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7, corr_QC3_s64x2);
+            tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS),     tmp1_QS0_s32x4); /* Accessed one extra head entry in the last iteration. */
+            tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4);
+            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+            tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+            tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+            tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+            tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         }
+
+         /* -------------------- epilog 0 -------------------- */
+
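+         /* Epilogs: drain the pipeline, finishing the remaining allpass sections
+            of the samples still in flight as the parallel set narrows. */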
+         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3);
+         vst1q_s32(state_QS - 1, tmp1_QS0_s32x4);  /* Saving one extra entry is OK. */
+         vst1q_s32(state_QS + 3, tmp1_QS1_s32x4);
+         corr_QC_s64x1  = vld1_s64 (corr_QC);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + 1);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 3);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 5);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t0_s64x2));
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + 3, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 5, corr_QC3_s64x2);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS),     vget_high_s32(tmp1_QS0_s32x4));
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 1 -------------------- */
+
+         tmp2_QS_s32x2  = vld1_s32 (state_QS);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2);
+         vst1_s32 (state_QS,     tmp1_QS_s32x2);
+         vst1q_s32(state_QS + 2, tmp1_QS1_s32x4);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + 0);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 2);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1q_s64(corr_QC + 0, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + 2, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 4, corr_QC3_s64x2);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); /* Accessed one extra head entry. */
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 2 -------------------- */
+
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1);
+         vst1_lane_s32(state_QS,     tmp1_QS_s32x2, 1);
+         vst1q_s32    (state_QS + 1, tmp1_QS1_s32x4);
+         corr_QC_s64x1  = vld1_s64(corr_QC);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 1);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 3);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t1_s64x2));
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 3, corr_QC3_s64x2);
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 3 -------------------- */
+
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS);
+         vst1q_s32(state_QS, tmp1_QS1_s32x4);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 2);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1q_s64(corr_QC,     corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 2, corr_QC3_s64x2);
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4); /* Accessed one extra head entry. */
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 4 -------------------- */
+
+         corr_QC_s64x1  = vld1_s64 (corr_QC);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 1);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t2_s64x2));
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC,     corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC3_s64x2);
+         vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
+
+         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4));
+         t3_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
+
+         /* -------------------- epilog 5 & 6 -------------------- */
+
+         vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
+         tmp2_QS_s32x2  = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
+         t3_s64x2       = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32);
+         tmp2_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
+         tmp2_QS_s32x2  = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2);
+         vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
+
+         corr_QC3_s64x2 = vld1q_s64(corr_QC);
+         t3_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2));
+         t3_s64x2       = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32)));
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64(vget_low_s64(corr_QC3_s64x2), vget_low_s64(t3_s64x2));
+         vst1_s64(corr_QC, corr_QC_s64x1);
+      }
+   }
+
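+   /* Scalar tail: processes any remaining samples, and the whole input when
+      the order is too small for the vector path above. */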
+   for( ; n < length; n++ ) {
+      input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+      /* Loop over allpass sections */
+      for( i = 0; i <= order; i++ ) {
+         /* Output of allpass section */
+         tmp2_QS[ 0 ]           = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+         state_QS[ order - i ]  = tmp1_QS[ 0 ];
+         corr_QC[  order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]           = tmp2_QS[ 0 ];
+      }
+   }
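+   /* Note: corr_QC[] is indexed in reverse relative to the C implementation,
+      so the zero-lag energy lives in corr_QC[ order ] rather than corr_QC[ 0 ]. */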
+   lsh = silk_CLZ64( corr_QC[ order ] ) - 35;
+   lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+   *scale = -( QC + lsh );
+   silk_assert( *scale >= -30 && *scale <= 12 );
+   {
+      const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh);
+      for( i = 0; i <= order - 3; i += 4 ) {
+         int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
+         int32x4_t corr_s32x4;
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
+         corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
+         corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
+         corr_s32x4     = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2));
+         corr_s32x4     = vrev64q_s32(corr_s32x4);
+         vst1q_s32(corr + order - i - 3, corr_s32x4);
+      }
+   }
+   if( lsh >= 0 ) {
+      for( ; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) );
+      }
+   } else {
+      for( ; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) );
+      }
+   }
+   silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC*/
+}
+
+#endif /* FIXED_POINT */
diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h
index 375b5eb..02f6802 100644
--- a/silk/fixed/main_FIX.h
+++ b/silk/fixed/main_FIX.h
@@ -36,6 +36,11 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "debug.h"
 #include "entenc.h"
 
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+   || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/warped_autocorrelation_FIX_arm.h"
+#endif
+
 #ifndef FORCE_CPP_BUILD
 #ifdef __cplusplus
 extern "C"
@@ -47,6 +52,9 @@ extern "C"
 #define silk_encode_do_VAD_Fxx      silk_encode_do_VAD_FIX
 #define silk_encode_frame_Fxx       silk_encode_frame_FIX
 
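+/* Fixed-point scalings for silk_warped_autocorrelation_FIX(), moved here from
+   warped_autocorrelation_FIX.c (and the MIPS header) so that all
+   implementations share one definition. */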
+#define QC  10
+#define QS  14
+
 /*********************/
 /* Encoder Functions */
 /*********************/
@@ -121,7 +129,7 @@ void silk_noise_shape_analysis_FIX(
 );
 
 /* Autocorrelations for a warped frequency axis */
-void silk_warped_autocorrelation_FIX(
+void silk_warped_autocorrelation_FIX_c(
           opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
           opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
     const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
@@ -130,6 +138,11 @@ void silk_warped_autocorrelation_FIX(
     const opus_int                  order                                   /* I    Correlation order (even)                                                    */
 );
 
+#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX)
+#define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+    (silk_warped_autocorrelation_FIX_c(corr, scale, input, warping_Q16, length, order))
+#endif
+
 /* Calculation of LTP state scaling */
 void silk_LTP_scale_ctrl_FIX(
     silk_encoder_state_FIX          *psEnc,                                 /* I/O  encoder state                                                               */
diff --git a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
index e803ef0..6916940 100644
--- a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
+++ b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
@@ -34,12 +34,6 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main_FIX.h"
 
-#undef QC
-#define QC  10
-
-#undef QS
-#define QS  14
-
 /* Autocorrelations for a warped frequency axis */
 #define OVERRIDE_silk_warped_autocorrelation_FIX
 void silk_warped_autocorrelation_FIX(
diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c
index 6ca6c11..994c299 100644
--- a/silk/fixed/warped_autocorrelation_FIX.c
+++ b/silk/fixed/warped_autocorrelation_FIX.c
@@ -31,17 +31,13 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main_FIX.h"
 
-#define QC  10
-#define QS  14
-
 #if defined(MIPSr1_ASM)
 #include "mips/warped_autocorrelation_FIX_mipsr1.h"
 #endif
 
 
-#ifndef OVERRIDE_silk_warped_autocorrelation_FIX
 /* Autocorrelations for a warped frequency axis */
-void silk_warped_autocorrelation_FIX(
+void silk_warped_autocorrelation_FIX_c(
           opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
           opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
     const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
@@ -92,4 +88,3 @@ void silk_warped_autocorrelation_FIX(
     }
     silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/
 }
-#endif /* OVERRIDE_silk_warped_autocorrelation_FIX */
diff --git a/silk/tests/test_unit_optimization_warped_autocorrelation.c b/silk/tests/test_unit_optimization_warped_autocorrelation.c
new file mode 100644
index 0000000..b7d0ad0
--- /dev/null
+++ b/silk/tests/test_unit_optimization_warped_autocorrelation.c
@@ -0,0 +1,441 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include "main_FIX.h"
+
+#ifdef FIXED_POINT
+
+/* Unrolling the input loop by 8 is about 25% faster than unrolling by 4 on a Chromebook with an ARMv7 processor. */
+#define NUM_PARALLEL_INPUTS 8
+
+/* This function is kept here because it serves as the code base for optimizations on other CPUs. */
+void silk_warped_autocorrelation_FIX_c_opt(
+         opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+         opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+   const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+   const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+   const opus_int                  length,                                 /* I    Length of input                                                             */
+   const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+)
+{
+   opus_int   n = 0, i, j, lsh;
+   opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 input_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 2 ] = { 0 }; /* Create one extra entry. */
+   opus_int32 *state_QS = state_QS_tmp + 1; /* Accessed one extra head entry in the last prolog and the last inner loop. */
+   opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+
+   /* Order must be even */
+   silk_assert( ( order & 1 ) == 0 );
+   silk_assert( 2 * QS - QC >= 0 );
+
+   /* Loop over samples */
+   if( order >= NUM_PARALLEL_INPUTS - 2 ) {
+      for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) {
+         for( i = 0; i < NUM_PARALLEL_INPUTS; i++ ) {
+            input_QS[i] = tmp1_QS[i] = silk_LSHIFT32( (opus_int32)input[ n + i ], QS );
+         }
+
+         /* Loop over allpass sections */
+
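+         /* Prologs: fill the 8-sample pipeline, bringing one more input into the
+            allpass chain at each stage; the inner loop further below then advances
+            all 8 samples in lockstep, and the epilogs drain them. */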
+         /* -------------------- prolog 0 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order ];
+         state_QS[ order ]     = tmp1_QS[ 0 ];
+         corr_QC[ order ]     += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 1 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 1 ]          = state_QS[ order     ];
+
+         state_QS[ order - 1 ] = tmp1_QS[ 0 ];
+         state_QS[ order     ] = tmp1_QS[ 1 ];
+
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order ]     += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 1 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 2 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 2 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 2 ]          = state_QS[ order     ];
+
+         state_QS[ order - 2 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 1 ];
+         state_QS[ order     ] = tmp1_QS[ 2 ];
+
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 1 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 2 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 3 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 3 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 3 ]          = state_QS[ order     ];
+
+         state_QS[ order - 3 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 2 ];
+         state_QS[ order     ] = tmp1_QS[ 3 ];
+
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 1 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 2 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 3 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 4 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 4 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 4 ]          = state_QS[ order     ];
+
+         state_QS[ order - 4 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 3 ];
+         state_QS[ order     ] = tmp1_QS[ 4 ];
+
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 1 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 2 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 3 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 4 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 5 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 5 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 5 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 4 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 5 ]          = state_QS[ order     ];
+
+         state_QS[ order - 5 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 4 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 3 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 4 ];
+         state_QS[ order     ] = tmp1_QS[ 5 ];
+
+         corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 5 ]          = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 1 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 2 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 3 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 4 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 5 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 6 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 6 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 6 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 5 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 4 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 5 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 6 ]          = state_QS[ order     ];
+
+         state_QS[ order - 6 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 5 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 4 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 3 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 4 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 5 ];
+         state_QS[ order     ] = tmp1_QS[ 6 ];
+
+         corr_QC[ order - 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 6 ]          = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ order - 1 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]          = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 2 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 3 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 4 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 5 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 6 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 7 ] - tmp1_QS[ 0 ], warping_Q16 ); /* Accesses one extra entry before the head when order is 6. */
+
+         /* -------------------- kernel loop -------------------- */
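+         /* Steady state: all 8 lanes are active.  Each iteration advances
+          * every lane one allpass section: save the old states, store the
+          * new lane outputs, accumulate the correlations, then compute the
+          * next allpass outputs.  Lane j reads the state just stored by
+          * lane j - 1, so later samples see earlier samples' outputs. */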
+
+         for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
+            /* Output of allpass section */
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               tmp2_QS[ j ] = state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ];
+            }
+
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] = tmp1_QS[ j ];
+            }
+
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               corr_QC[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ j ], input_QS[ j ] ), 2 * QS - QC );
+            }
+
+            for( j = NUM_PARALLEL_INPUTS - 1; j >= 0; j-- ) {
+               tmp1_QS[ j ] = silk_SMLAWB( tmp2_QS[ j ], state_QS[ order - i - NUM_PARALLEL_INPUTS + j ] - tmp1_QS[ j ], warping_Q16 ); /* Accesses one extra entry before the head in the last iteration. */
+            }
+         }
+
+         /* -------------------- epilog 0 -------------------- */
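+         /* The epilogs drain the pipeline: lanes whose sample has already
+          * passed through every allpass section drop out stage by stage. */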
+
+         tmp2_QS[ 2 ]  = state_QS[ 1 ];
+         tmp2_QS[ 3 ]  = state_QS[ 2 ];
+         tmp2_QS[ 4 ]  = state_QS[ 3 ];
+         tmp2_QS[ 5 ]  = state_QS[ 4 ];
+         tmp2_QS[ 6 ]  = state_QS[ 5 ];
+         tmp2_QS[ 7 ]  = state_QS[ 6 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 1 ];
+         state_QS[ 1 ] = tmp1_QS[ 2 ];
+         state_QS[ 2 ] = tmp1_QS[ 3 ];
+         state_QS[ 3 ] = tmp1_QS[ 4 ];
+         state_QS[ 4 ] = tmp1_QS[ 5 ];
+         state_QS[ 5 ] = tmp1_QS[ 6 ];
+         state_QS[ 6 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 5 ] - tmp1_QS[ 7 ], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 4 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 3 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 2 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]  = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 1 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]  = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ 0 ] - tmp1_QS[ 2 ], warping_Q16 );
+
+         /* -------------------- epilog 1 -------------------- */
+
+         tmp2_QS[ 3 ]  = state_QS[ 1 ];
+         tmp2_QS[ 4 ]  = state_QS[ 2 ];
+         tmp2_QS[ 5 ]  = state_QS[ 3 ];
+         tmp2_QS[ 6 ]  = state_QS[ 4 ];
+         tmp2_QS[ 7 ]  = state_QS[ 5 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 2 ];
+         state_QS[ 1 ] = tmp1_QS[ 3 ];
+         state_QS[ 2 ] = tmp1_QS[ 4 ];
+         state_QS[ 3 ] = tmp1_QS[ 5 ];
+         state_QS[ 4 ] = tmp1_QS[ 6 ];
+         state_QS[ 5 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 4 ] - tmp1_QS[ 7 ], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 3 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 2 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 1 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]  = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 0 ] - tmp1_QS[ 3 ], warping_Q16 );
+
+         /* -------------------- epilog 2 -------------------- */
+
+         tmp2_QS[ 4 ]  = state_QS[ 1 ];
+         tmp2_QS[ 5 ]  = state_QS[ 2 ];
+         tmp2_QS[ 6 ]  = state_QS[ 3 ];
+         tmp2_QS[ 7 ]  = state_QS[ 4 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 3 ];
+         state_QS[ 1 ] = tmp1_QS[ 4 ];
+         state_QS[ 2 ] = tmp1_QS[ 5 ];
+         state_QS[ 3 ] = tmp1_QS[ 6 ];
+         state_QS[ 4 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 3 ] - tmp1_QS[ 7 ], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 2 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 1 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 0 ] - tmp1_QS[ 4 ], warping_Q16 );
+
+         /* -------------------- epilog 3 -------------------- */
+
+         tmp2_QS[ 5 ]  = state_QS[ 1 ];
+         tmp2_QS[ 6 ]  = state_QS[ 2 ];
+         tmp2_QS[ 7 ]  = state_QS[ 3 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 4 ];
+         state_QS[ 1 ] = tmp1_QS[ 5 ];
+         state_QS[ 2 ] = tmp1_QS[ 6 ];
+         state_QS[ 3 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 2 ] - tmp1_QS[ 7 ], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 1 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 0 ] - tmp1_QS[ 5 ], warping_Q16 );
+
+         /* -------------------- epilog 4 -------------------- */
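+         /* The last three lanes finish here; their remaining allpass
+          * sections are folded together below. */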
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         {
+            opus_int32 tmp1_QS_2 = silk_SMLAWB( state_QS[ 1 ], tmp1_QS[ 5 ] - tmp1_QS[ 6 ], warping_Q16 );
+            state_QS[ 1 ]        = silk_SMLAWB( state_QS[ 2 ], tmp1_QS[ 6 ] - tmp1_QS[ 7 ], warping_Q16 );
+
+            /* -------------------- epilog 5 & 6 -------------------- */
+
+            state_QS[ 0 ] = silk_SMLAWB( tmp1_QS[ 6 ], tmp1_QS_2 - state_QS[ 1 ], warping_Q16 );
+            state_QS[ 2 ] = tmp1_QS[ 7 ];
+
+            corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS_2,     input_QS[ 6 ] ), 2 * QS - QC )
+                  +         silk_RSHIFT64( silk_SMULL( state_QS[ 0 ], input_QS[ 7 ] ), 2 * QS - QC );
+            corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( state_QS[ 1 ], input_QS[ 7 ] ), 2 * QS - QC );
+         }
+      }
+   }
+
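+   /* Process any remaining samples one at a time with the scalar algorithm. */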
+   for( ; n < length; n++ ) {
+      input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+      /* Loop over allpass sections */
+      for( i = 0; i <= order; i++ ) {
+         /* Output of allpass section */
+         tmp2_QS[ 0 ]           = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+         state_QS[ order - i ]  = tmp1_QS[ 0 ];
+         corr_QC[  order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]          = tmp2_QS[ 0 ];
+      }
+   }
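+   /* Normalize so the largest correlation, the zero-lag energy
+    * corr_QC[ order ], fits in 32 bits; *scale reports the applied shift. */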
+   lsh = silk_CLZ64( corr_QC[ order ] ) - 35;
+   lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+   *scale = -( QC + lsh );
+   silk_assert( *scale >= -30 && *scale <= 12 );
+   if( lsh >= 0 ) {
+      for( i = 0; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) );
+      }
+   } else {
+      for( i = 0; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) );
+      }
+   }
+   silk_assert( corr_QC[ order ] >= 0 ); /* If this breaks, decrease QC. */
+}
+
+#define MAX_LENGTH 360
+
+static int test_warped_autocorrelation(int arch)
+{
+   unsigned int i;
+   opus_int32 corrOrg[MAX_SHAPE_LPC_ORDER + 1], corrOpt[MAX_SHAPE_LPC_ORDER + 1];
+   opus_int   scaleOrg, scaleOpt;
+   opus_int16 input[MAX_LENGTH];
+   opus_int   warping_Q16, length, order;
+   (void)arch;
+
+   printf("%50s", "silk_warped_autocorrelation_FIX() ...");
+   for( order = 0; order <= MAX_SHAPE_LPC_ORDER; order += 2 ) /* order must be even */
+   {
+      for( length = 0; length <= MAX_LENGTH; length++ )
+      {
+         for (i=0;i<MAX_LENGTH;++i)
+         {
+            input[i] = (rand() % 32767) - 16384;
+         }
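+         /* Random warping coefficient in Q16, in [ 0, 0.5 ). */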
+         warping_Q16 = rand() % 32767;
+         /* Zero both buffers so entries the functions leave untouched still compare equal. */
+         memset(corrOrg, 0, sizeof(corrOrg));
+         memset(corrOpt, 0, sizeof(corrOpt));
+
+         silk_warped_autocorrelation_FIX_c(corrOrg, &scaleOrg, input, warping_Q16, length, order);
+         silk_warped_autocorrelation_FIX  (corrOpt, &scaleOpt, input, warping_Q16, length, order);
+         if (memcmp(corrOpt, corrOrg, sizeof(corrOrg)))
+         {
+            printf("order=%2d length=%3d failed!\n", order, length);
+            for (i=0;i<sizeof(corrOrg) / sizeof(*corrOrg);i++)
+            {
+               if (corrOrg[i] != corrOpt[i])
+               {
+                  printf("\ncorrOrg[%3d]=%12d, corrOpt[%3d]=%12d", i, corrOrg[i], i, corrOpt[i]);
+               }
+            }
+            printf("\n");
+            return -1;
+         }
+      }
+   }
+   printf(" passed!\n");
+   return 0;
+}
+#endif /* FIXED_POINT */
diff --git a/silk_headers.mk b/silk_headers.mk
index f8bf1d2..c826db9 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -28,6 +28,7 @@ silk/arm/macros_arm64.h \
 silk/arm/SigProc_FIX_armv4.h \
 silk/arm/SigProc_FIX_armv5e.h \
 silk/arm/NSQ_neon.h \
+silk/arm/warped_autocorrelation_FIX_arm.h \
 silk/fixed/main_FIX.h \
 silk/fixed/structs_FIX.h \
 silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 7229ee3..62d2cdd 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -84,7 +84,8 @@ silk/x86/VQ_WMat_EC_sse.c
 
 SILK_SOURCES_ARM_NEON_INTR = \
 silk/arm/arm_silk_map.c \
-silk/arm/NSQ_neon.c
+silk/arm/NSQ_neon.c \
+silk/arm/warped_autocorrelation_FIX_neon_intr.c
 
 SILK_SOURCES_FIXED = \
 silk/fixed/LTP_analysis_filter_FIX.c \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
new file mode 100644
index 0000000..b5c25d9
--- /dev/null
+++ b/tests/test_unit_optimization.c
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include "stack_alloc.h"
+
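+/* Keep the test sources included below from pulling in config.h again. */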
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#ifdef FIXED_POINT
+
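+/* Each optimization unit test is a .c file included directly into this driver. */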
+# include "celt/tests/test_unit_optimization_lpc.c"
+# include "silk/tests/test_unit_optimization_warped_autocorrelation.c"
+
+#endif
+
+int main(void)
+{
+   int result = 0; /* 0: passed; other: failed */
+   int count = 10;
+#ifdef FIXED_POINT
+   int arch;
+#endif /* FIXED_POINT */
+   ALLOC_STACK;
+#ifdef FIXED_POINT
+   arch = opus_select_arch();
+#endif /* FIXED_POINT */
+
+   while (!result && count--) {
+      printf("\n--------------------------- Testing optimization ---------------------------\n");
+#ifdef FIXED_POINT
+      result |= test_fir(arch);
+      result |= test_warped_autocorrelation(arch);
+#endif /* FIXED_POINT */
+   }
+   return result;
+}
-- 
2.8.0.rc3.226.g39d4020


