[opus] [PATCH 3/5] Optimize silk_warped_autocorrelation_FIX() for ARM NEON

Linfeng Zhang linfengz at google.com
Thu Jul 14 00:49:00 UTC 2016


Create silk_warped_autocorrelation_FIX_c_opt(), which unrolls the loop over
input samples and processes 8 inputs in parallel. It has a very long prolog
and epilog, but that is the cost of getting good speed out of this heavily
hit function. This function may serve as the code base for optimizations on
other CPUs.
Create ARM NEON intrinsics optimization silk_warped_autocorrelation_FIX_neon().
Create unit test silk/tests/test_unit_optimization_warped_autocorrelation.c.
---
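For reference, the recursion that the unrolled functions parallelize is the
scalar per-sample allpass cascade sketched below. This is only a sketch: the
helper name warped_autocorr_scalar_sketch is hypothetical, while the variable
names, the Q formats (QS = 14, QC = 10) and the silk_* fixed-point macros are
the ones the patch itself uses. It mirrors the remainder loop that both new
functions fall back to for samples the unrolled loop does not cover.

   /* Scalar sketch (hypothetical helper, not part of the patch):
    * one input sample flows through all order + 1 allpass sections. */
   static void warped_autocorr_scalar_sketch(
      opus_int64       *corr_QC,     /* I/O  accumulators [ order + 1 ], zero-initialized          */
      opus_int32       *state_QS,    /* I/O  allpass state, one entry into a zeroed [ order + 2 ] buffer */
      const opus_int16 *input,       /* I    input samples                                          */
      const opus_int    warping_Q16, /* I    warping coefficient                                    */
      const opus_int    length,      /* I    number of input samples                                */
      const opus_int    order        /* I    correlation order (even)                               */
   )
   {
      opus_int   n, i;
      opus_int32 input_QS, tmp1_QS, tmp2_QS;
      for( n = 0; n < length; n++ ) {
         /* Bring the input sample up to Q(QS). */
         input_QS = tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS );
         for( i = 0; i <= order; i++ ) {
            /* Output of allpass section i. At i == order this reads
             * state_QS[ -1 ]; the patch keeps that access in bounds by
             * pointing state_QS one entry into a larger zeroed buffer. */
            tmp2_QS                = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS, warping_Q16 );
            state_QS[ order - i ]  = tmp1_QS;
            /* Accumulate the warped autocorrelation in Q(QC). */
            corr_QC[ order - i ]  += silk_RSHIFT64( silk_SMULL( tmp1_QS, input_QS ), 2 * QS - QC );
            tmp1_QS                = tmp2_QS;
         }
      }
   }

The unrolled versions keep eight such recursions in flight at once, staggered
by one allpass section each; the long prolog feeds the eight samples into the
cascade one section at a time and the long epilog drains them back out, which
is where most of the extra code comes from.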
 Makefile.am                                        |   5 +-
 silk/arm/arm_silk_map.c                            |  20 +
 silk/fixed/arm/warped_autocorrelation_FIX_arm.h    |  65 +++
 .../arm/warped_autocorrelation_FIX_neon_intr.c     | 495 +++++++++++++++++++++
 silk/fixed/main_FIX.h                              |  15 +-
 .../fixed/mips/warped_autocorrelation_FIX_mipsr1.h |   6 -
 silk/fixed/warped_autocorrelation_FIX.c            |   7 +-
 ...test_unit_optimization_warped_autocorrelation.c | 441 ++++++++++++++++++
 silk_headers.mk                                    |   1 +
 silk_sources.mk                                    |   3 +
 tests/test_unit_optimization.c                     |   2 +
 11 files changed, 1046 insertions(+), 14 deletions(-)
 create mode 100644 silk/fixed/arm/warped_autocorrelation_FIX_arm.h
 create mode 100644 silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
 create mode 100644 silk/tests/test_unit_optimization_warped_autocorrelation.c

diff --git a/Makefile.am b/Makefile.am
index 2bfb923..c66fb2d 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,6 +50,7 @@ SILK_SOURCES += $(SILK_SOURCES_ARM)
 if HAVE_ARM_NEON_INTR
 CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
 SILK_SOURCES += $(SILK_SOURCES_ARM_NEON_INTR)
+SILK_SOURCES += $(SILK_SOURCES_FIXED_ARM_NEON_INTR)
 endif
 
 if HAVE_ARM_NE10
@@ -327,7 +328,9 @@ $(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
 endif
 
 if HAVE_ARM_NEON_INTR
-ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) $(SILK_SOURCES_ARM_NEON_INTR:.c=.lo)
+ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
+                    $(SILK_SOURCES_ARM_NEON_INTR:.c=.lo) \
+                    $(SILK_SOURCES_FIXED_ARM_NEON_INTR:.c=.lo)
 $(ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += \
  $(OPUS_ARM_NEON_INTR_CFLAGS)  $(NE10_CFLAGS)
 endif
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 9bd86a7..2e330c4 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -28,6 +28,7 @@ POSSIBILITY OF SUCH DAMAGE.
 # include "config.h"
 #endif
 
+#include "main_FIX.h"
 #include "NSQ.h"
 
 #if defined(OPUS_HAVE_RTCD)
@@ -52,4 +53,23 @@ opus_int32
 
 # endif
 
+#if defined(FIXED_POINT) && \
+ defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+
+void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
+          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+    const opus_int                  length,                                 /* I    Length of input                                                             */
+    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+) = {
+      silk_warped_autocorrelation_FIX_c,              /* ARMv4 */
+      silk_warped_autocorrelation_FIX_c,              /* EDSP */
+      silk_warped_autocorrelation_FIX_c,              /* Media */
+      MAY_HAVE_NEON(silk_warped_autocorrelation_FIX), /* Neon */
+};
+
+#endif
+
 #endif /* OPUS_HAVE_RTCD */
diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_arm.h b/silk/fixed/arm/warped_autocorrelation_FIX_arm.h
new file mode 100644
index 0000000..ee892bf
--- /dev/null
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(WARPED_AUTOCORRELATION_FIX_ARM_H)
+# define WARPED_AUTOCORRELATION_FIX_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if defined(FIXED_POINT)
+
+#  if defined(OPUS_ARM_MAY_HAVE_NEON)
+void silk_warped_autocorrelation_FIX_neon(
+          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+    const opus_int                  length,                                 /* I    Length of input                                                             */
+    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+);
+#  endif
+
+#  if !defined(OPUS_HAVE_RTCD)
+#   define OVERRIDE_silk_warped_autocorrelation_FIX (1)
+#   define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+  ((void)(arch),PRESUME_NEON(silk_warped_autocorrelation_FIX)(corr, scale, input, warping_Q16, length, order))
+#  endif
+
+#  if !defined(OVERRIDE_silk_warped_autocorrelation_FIX)
+/*Is run-time CPU detection enabled on this platform?*/
+#   if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+   || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+   && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
+extern void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK+1])(opus_int32*, opus_int*, const opus_int16*, const opus_int, const opus_int, const opus_int);
+
+#    define OVERRIDE_silk_warped_autocorrelation_FIX
+#    define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+  ((*SILK_WARPED_AUTOCORRELATION_FIX_IMPL[(arch)&OPUS_ARCHMASK])(corr, scale, input, warping_Q16, length, order))
+#   endif
+#  endif
+# endif /* end FIXED_POINT */
+
+#endif /* end WARPED_AUTOCORRELATION_FIX_ARM_H */
diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
new file mode 100644
index 0000000..80dd949
--- /dev/null
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -0,0 +1,495 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file warped_autocorrelation_FIX_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for silk silk_warped_autocorrelation_FIX functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+
+#include <arm_neon.h>
+#include "stack_alloc.h"
+#include "main_FIX.h"
+
+#ifdef FIXED_POINT
+
+#define NUM_PARALLEL_INPUTS 8
+
+void silk_warped_autocorrelation_FIX_neon(
+         opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+         opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+   const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+   const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+   const opus_int                  length,                                 /* I    Length of input                                                             */
+   const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+)
+{
+   opus_int   n = 0, i, lsh;
+   opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 input_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra entries.
+   opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop, and one extra end entry in the last prolog.
+   opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+
+   /* Order must be even */
+   silk_assert( ( order & 1 ) == 0 );
+   silk_assert( 2 * QS - QC >= 0 );
+
+   /* Loop over samples */
+   if( order >= NUM_PARALLEL_INPUTS - 2 ) {
+      const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16);
+      for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) {
+         int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4, tmp2_QS1_s32x4;
+         int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2, corr_QC3_s64x2;
+         int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
+         int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2;
+         int64x1_t corr_QC_s64x1;
+         const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n),     QS);
+         const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4), QS);
+         vst1q_s32(tmp1_QS,      input_QS0_s32x4);
+         vst1q_s32(tmp1_QS  + 4, input_QS1_s32x4);
+
+         /* Loop over allpass sections */
+
+         /* -------------------- prolog 0 -------------------- */
+
+         tmp1_QS_s32x2  = vget_low_s32(input_QS0_s32x4);
+         tmp2_QS_s32x2  = vld1_s32(state_QS + order); // Accessed one extra end entry.
+         vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
+         corr_QC_s64x1  = vld1_s64(corr_QC + order);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2));
+         vst1_s64(corr_QC + order, corr_QC_s64x1);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 1), tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
+
+         /* -------------------- prolog 1 -------------------- */
+
+         tmp2_QS_s32x2  = vld1_s32(state_QS + order - 1);
+         vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 2), tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
+
+         /* -------------------- prolog 2 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry.
+         vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4);  // Saving one extra entry is OK.
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t1_s64x2));
+         vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS0_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3);
+
+         /* -------------------- prolog 3 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3);
+         vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4), tmp1_QS0_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vget_low_s32(input_QS1_s32x4);
+
+         /* -------------------- prolog 4 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4);
+         tmp2_QS_s32x2  = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
+         vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4);
+         vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32 (input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t2_s64x2));
+         vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5), tmp1_QS0_s32x4);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 1), tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
+
+         /* -------------------- prolog 5 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5);
+         tmp2_QS_s32x2  = vld1_s32 (state_QS + order - 1);
+         vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4);
+         vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2);
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32 (input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6), tmp1_QS0_s32x4);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 2), tmp1_QS_s32x2);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
+
+         /* -------------------- prolog 6 -------------------- */
+
+         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry.
+         vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4);
+         vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4);  // Saving one extra entry is OK.
+         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2);
+         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t3_s64x2));
+         vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2);
+         vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2);
+         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
+         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7), tmp1_QS0_s32x4); // Accessed one extra head entry when order is 6.
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS1_s32x4);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3);
+
+         /* -------------------- kernel loop -------------------- */
+
+         for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
+             /* Output of allpass section */
+            tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1);
+            tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5);
+            vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1, tmp1_QS0_s32x4);
+            vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5, tmp1_QS1_s32x4);
+            corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1);
+            corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3);
+            corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5);
+            corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7);
+            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+            t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+            t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+            corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+            corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+            corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1, corr_QC0_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3, corr_QC1_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5, corr_QC2_s64x2);
+            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7, corr_QC3_s64x2);
+            tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS),     tmp1_QS0_s32x4); // Accessed one extra head entry in the last loop.
+            tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4);
+            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+            tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+            tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+            tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+            tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         }
+
+         /* -------------------- epilog 0 -------------------- */
+
+         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3);
+         vst1q_s32(state_QS - 1, tmp1_QS0_s32x4);  // Saving one extra entry is OK.
+         vst1q_s32(state_QS + 3, tmp1_QS1_s32x4);
+         corr_QC_s64x1  = vld1_s64 (corr_QC);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + 1);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 3);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 5);
+         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t0_s64x2));
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + 3, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 5, corr_QC3_s64x2);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS),     vget_high_s32(tmp1_QS0_s32x4));
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 1 -------------------- */
+
+         tmp2_QS_s32x2  = vld1_s32 (state_QS);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2);
+         vst1_s32 (state_QS,     tmp1_QS_s32x2);
+         vst1q_s32(state_QS + 2, tmp1_QS1_s32x4);
+         corr_QC1_s64x2 = vld1q_s64(corr_QC + 0);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 2);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1q_s64(corr_QC + 0, corr_QC1_s64x2);
+         vst1q_s64(corr_QC + 2, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 4, corr_QC3_s64x2);
+         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); // Accessed one extra head entry.
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 2 -------------------- */
+
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1);
+         vst1_lane_s32(state_QS,     tmp1_QS_s32x2, 1);
+         vst1q_s32    (state_QS + 1, tmp1_QS1_s32x4);
+         corr_QC_s64x1  = vld1_s64(corr_QC);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC + 1);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 3);
+         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_high_s32(input_QS0_s32x4));
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t1_s64x2));
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 3, corr_QC3_s64x2);
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 3 -------------------- */
+
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS);
+         vst1q_s32(state_QS, tmp1_QS1_s32x4);
+         corr_QC2_s64x2 = vld1q_s64(corr_QC);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 2);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1q_s64(corr_QC,     corr_QC2_s64x2);
+         vst1q_s64(corr_QC + 2, corr_QC3_s64x2);
+         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4); // Accessed one extra head entry.
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+         /* -------------------- epilog 4 -------------------- */
+
+         corr_QC_s64x1  = vld1_s64 (corr_QC);
+         corr_QC3_s64x2 = vld1q_s64(corr_QC + 1);
+         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t2_s64x2));
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64 (corr_QC,     corr_QC_s64x1);
+         vst1q_s64(corr_QC + 1, corr_QC3_s64x2);
+         vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
+
+         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
+         tmp1_QS_s32x2  = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4));
+         t3_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+         tmp1_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
+         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
+
+         /* -------------------- epilog 5 & 6 -------------------- */
+
+         vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
+         tmp2_QS_s32x2  = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
+         t3_s64x2       = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32);
+         tmp2_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
+         tmp2_QS_s32x2  = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2);
+         vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
+
+         corr_QC3_s64x2 = vld1q_s64(corr_QC);
+         t3_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+         vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2));
+         t3_s64x2       = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32)));
+         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+         corr_QC_s64x1  = vadd_s64(vget_low_s64(corr_QC3_s64x2), vget_low_s64(t3_s64x2));
+         vst1_s64(corr_QC, corr_QC_s64x1);
+      }
+   }
+
+   for( ; n < length; n++ ) {
+      input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+      /* Loop over allpass sections */
+      for( i = 0; i <= order; i++ ) {
+         /* Output of allpass section */
+         tmp2_QS[ 0 ]           = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+         state_QS[ order - i ]  = tmp1_QS[ 0 ];
+         corr_QC[  order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]           = tmp2_QS[ 0 ];
+      }
+   }
+   lsh = silk_CLZ64( corr_QC[ order ] ) - 35;
+   lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+   *scale = -( QC + lsh );
+   silk_assert( *scale >= -30 && *scale <= 12 );
+   const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh);
+   for( i = 0; i <= order - 3; i += 4 ) {
+      int64x2_t corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
+      int64x2_t corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
+      corr_QC0_s64x2           = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
+      corr_QC1_s64x2           = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
+      int32x4_t corr_s32x4     = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2));
+      corr_s32x4               = vrev64q_s32(corr_s32x4);
+      vst1q_s32(corr + order - i - 3, corr_s32x4);
+   }
+   if( lsh >= 0 ) {
+      for( ; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) );
+      }
+   } else {
+      for( ; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) );
+      }
+   }
+   silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC*/
+}
+
+#endif /* FIXED_POINT */
diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h
index 375b5eb..2abb5d9 100644
--- a/silk/fixed/main_FIX.h
+++ b/silk/fixed/main_FIX.h
@@ -36,6 +36,11 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "debug.h"
 #include "entenc.h"
 
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+   || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "fixed/arm/warped_autocorrelation_FIX_arm.h"
+#endif
+
 #ifndef FORCE_CPP_BUILD
 #ifdef __cplusplus
 extern "C"
@@ -47,6 +52,9 @@ extern "C"
 #define silk_encode_do_VAD_Fxx      silk_encode_do_VAD_FIX
 #define silk_encode_frame_Fxx       silk_encode_frame_FIX
 
+#define QC  10
+#define QS  14
+
 /*********************/
 /* Encoder Functions */
 /*********************/
@@ -121,7 +129,7 @@ void silk_noise_shape_analysis_FIX(
 );
 
 /* Autocorrelations for a warped frequency axis */
-void silk_warped_autocorrelation_FIX(
+void silk_warped_autocorrelation_FIX_c(
           opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
           opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
     const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
@@ -130,6 +138,11 @@ void silk_warped_autocorrelation_FIX(
     const opus_int                  order                                   /* I    Correlation order (even)                                                    */
 );
 
+#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX)
+#define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+    (silk_warped_autocorrelation_FIX_c(corr, scale, input, warping_Q16, length, order))
+#endif
+
 /* Calculation of LTP state scaling */
 void silk_LTP_scale_ctrl_FIX(
     silk_encoder_state_FIX          *psEnc,                                 /* I/O  encoder state                                                               */
diff --git a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
index e803ef0..6916940 100644
--- a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
+++ b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
@@ -34,12 +34,6 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main_FIX.h"
 
-#undef QC
-#define QC  10
-
-#undef QS
-#define QS  14
-
 /* Autocorrelations for a warped frequency axis */
 #define OVERRIDE_silk_warped_autocorrelation_FIX
 void silk_warped_autocorrelation_FIX(
diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c
index 6ca6c11..994c299 100644
--- a/silk/fixed/warped_autocorrelation_FIX.c
+++ b/silk/fixed/warped_autocorrelation_FIX.c
@@ -31,17 +31,13 @@ POSSIBILITY OF SUCH DAMAGE.
 
 #include "main_FIX.h"
 
-#define QC  10
-#define QS  14
-
 #if defined(MIPSr1_ASM)
 #include "mips/warped_autocorrelation_FIX_mipsr1.h"
 #endif
 
 
-#ifndef OVERRIDE_silk_warped_autocorrelation_FIX
 /* Autocorrelations for a warped frequency axis */
-void silk_warped_autocorrelation_FIX(
+void silk_warped_autocorrelation_FIX_c(
           opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
           opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
     const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
@@ -92,4 +88,3 @@ void silk_warped_autocorrelation_FIX(
     }
     silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/
 }
-#endif /* OVERRIDE_silk_warped_autocorrelation_FIX */
diff --git a/silk/tests/test_unit_optimization_warped_autocorrelation.c b/silk/tests/test_unit_optimization_warped_autocorrelation.c
new file mode 100644
index 0000000..b7d0ad0
--- /dev/null
+++ b/silk/tests/test_unit_optimization_warped_autocorrelation.c
@@ -0,0 +1,441 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include "main_FIX.h"
+
+#ifdef FIXED_POINT
+
+/* Unrolling the input loop by 8 is about 25% faster than unrolling it by 4 on a Chromebook with an ARMv7 processor. */
+#define NUM_PARALLEL_INPUTS 8
+
+/* Keep this function here because it is the code base for optimizations on different CPUs. */
+void silk_warped_autocorrelation_FIX_c_opt(
+         opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
+         opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
+   const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
+   const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
+   const opus_int                  length,                                 /* I    Length of input                                                             */
+   const opus_int                  order                                   /* I    Correlation order (even)                                                    */
+)
+{
+   opus_int   n = 0, i, j, lsh;
+   opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 input_QS[NUM_PARALLEL_INPUTS];
+   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 2 ] = { 0 }; // Create one extra entry.
+   opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop.
+   opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+
+   /* Order must be even */
+   silk_assert( ( order & 1 ) == 0 );
+   silk_assert( 2 * QS - QC >= 0 );
+
+   /* Loop over samples */
+   if( order >= NUM_PARALLEL_INPUTS - 2 ) {
+      for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) {
+         for( i = 0; i < NUM_PARALLEL_INPUTS; i++ ) {
+            input_QS[i] = tmp1_QS[i] = silk_LSHIFT32( (opus_int32)input[ n + i ], QS );
+         }
+
+         /* Loop over allpass sections */
+
+         /* -------------------- prolog 0 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order ];
+         state_QS[ order ]     = tmp1_QS[ 0 ];
+         corr_QC[ order ]     += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 1 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 1 ]          = state_QS[ order     ];
+
+         state_QS[ order - 1 ] = tmp1_QS[ 0 ];
+         state_QS[ order     ] = tmp1_QS[ 1 ];
+
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order ]     += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 1 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 2 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 2 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 2 ]          = state_QS[ order     ];
+
+         state_QS[ order - 2 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 1 ];
+         state_QS[ order     ] = tmp1_QS[ 2 ];
+
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 1 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 2 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 3 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 3 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 3 ]          = state_QS[ order     ];
+
+         state_QS[ order - 3 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 2 ];
+         state_QS[ order     ] = tmp1_QS[ 3 ];
+
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 1 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 2 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 3 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 4 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 4 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 4 ]          = state_QS[ order     ];
+
+         state_QS[ order - 4 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 3 ];
+         state_QS[ order     ] = tmp1_QS[ 4 ];
+
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 1 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 2 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 3 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 4 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 5 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 5 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 5 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 4 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 5 ]          = state_QS[ order     ];
+
+         state_QS[ order - 5 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 4 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 3 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 4 ];
+         state_QS[ order     ] = tmp1_QS[ 5 ];
+
+         corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 5 ]          = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 1 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 2 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 3 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 4 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 5 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 6 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+         /* -------------------- prolog 6 -------------------- */
+
+         tmp2_QS[ 0 ]          = state_QS[ order - 6 ];
+         tmp2_QS[ 1 ]          = state_QS[ order - 5 ];
+         tmp2_QS[ 2 ]          = state_QS[ order - 4 ];
+         tmp2_QS[ 3 ]          = state_QS[ order - 3 ];
+         tmp2_QS[ 4 ]          = state_QS[ order - 2 ];
+         tmp2_QS[ 5 ]          = state_QS[ order - 1 ];
+         tmp2_QS[ 6 ]          = state_QS[ order     ];
+
+         state_QS[ order - 6 ] = tmp1_QS[ 0 ];
+         state_QS[ order - 5 ] = tmp1_QS[ 1 ];
+         state_QS[ order - 4 ] = tmp1_QS[ 2 ];
+         state_QS[ order - 3 ] = tmp1_QS[ 3 ];
+         state_QS[ order - 2 ] = tmp1_QS[ 4 ];
+         state_QS[ order - 1 ] = tmp1_QS[ 5 ];
+         state_QS[ order     ] = tmp1_QS[ 6 ];
+
+         corr_QC[ order - 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ order     ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 6 ]          = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ order - 1 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]          = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 2 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]          = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 3 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]          = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 4 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]          = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 5 ] - tmp1_QS[ 2 ], warping_Q16 );
+         tmp1_QS[ 1 ]          = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 6 ] - tmp1_QS[ 1 ], warping_Q16 );
+         tmp1_QS[ 0 ]          = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 7 ] - tmp1_QS[ 0 ], warping_Q16 ); // Accessed one extra head entry when order is 6.
+
+         /* -------------------- kernel loop -------------------- */
+
+         for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
+            /* Output of allpass section */
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               tmp2_QS[ j ] = state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ];
+            }
+
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] = tmp1_QS[ j ];
+            }
+
+            for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+               corr_QC[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ j ], input_QS[ j ] ), 2 * QS - QC );
+            }
+
+            for( j = NUM_PARALLEL_INPUTS - 1; j >= 0; j-- ) {
+               tmp1_QS[ j ] = silk_SMLAWB( tmp2_QS[ j ], state_QS[ order - i - NUM_PARALLEL_INPUTS + j ] - tmp1_QS[ j ], warping_Q16 ); /* Accesses one extra entry before state_QS[ 0 ] in the last iteration. */
+            }
+         }
+
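+         /* The epilog stages below drain the remaining allpass sections once the state window has reached the start of state_QS[]; the lowest-index lane retires at each stage. */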
+         /* -------------------- epilog 0 -------------------- */
+
+         tmp2_QS[ 2 ]  = state_QS[ 1 ];
+         tmp2_QS[ 3 ]  = state_QS[ 2 ];
+         tmp2_QS[ 4 ]  = state_QS[ 3 ];
+         tmp2_QS[ 5 ]  = state_QS[ 4 ];
+         tmp2_QS[ 6 ]  = state_QS[ 5 ];
+         tmp2_QS[ 7 ]  = state_QS[ 6 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 1 ];
+         state_QS[ 1 ] = tmp1_QS[ 2 ];
+         state_QS[ 2 ] = tmp1_QS[ 3 ];
+         state_QS[ 3 ] = tmp1_QS[ 4 ];
+         state_QS[ 4 ] = tmp1_QS[ 5 ];
+         state_QS[ 5 ] = tmp1_QS[ 6 ];
+         state_QS[ 6 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 5 ] - tmp1_QS[ 7 ], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 4 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 3 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 2 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]  = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 1 ] - tmp1_QS[ 3 ], warping_Q16 );
+         tmp1_QS[ 2 ]  = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ 0 ] - tmp1_QS[ 2 ], warping_Q16 );
+
+         /* -------------------- epilog 1 -------------------- */
+
+         tmp2_QS[ 3 ]  = state_QS[ 1 ];
+         tmp2_QS[ 4 ]  = state_QS[ 2 ];
+         tmp2_QS[ 5 ]  = state_QS[ 3 ];
+         tmp2_QS[ 6 ]  = state_QS[ 4 ];
+         tmp2_QS[ 7 ]  = state_QS[ 5 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 2 ];
+         state_QS[ 1 ] = tmp1_QS[ 3 ];
+         state_QS[ 2 ] = tmp1_QS[ 4 ];
+         state_QS[ 3 ] = tmp1_QS[ 5 ];
+         state_QS[ 4 ] = tmp1_QS[ 6 ];
+         state_QS[ 5 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 4 ] - tmp1_QS[ 7 ], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 3 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 2 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 1 ] - tmp1_QS[ 4 ], warping_Q16 );
+         tmp1_QS[ 3 ]  = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 0 ] - tmp1_QS[ 3 ], warping_Q16 );
+
+         /* -------------------- epilog 2 -------------------- */
+
+         tmp2_QS[ 4 ]  = state_QS[ 1 ];
+         tmp2_QS[ 5 ]  = state_QS[ 2 ];
+         tmp2_QS[ 6 ]  = state_QS[ 3 ];
+         tmp2_QS[ 7 ]  = state_QS[ 4 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 3 ];
+         state_QS[ 1 ] = tmp1_QS[ 4 ];
+         state_QS[ 2 ] = tmp1_QS[ 5 ];
+         state_QS[ 3 ] = tmp1_QS[ 6 ];
+         state_QS[ 4 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 3 ] - tmp1_QS[ 7 ], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 2 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 1 ] - tmp1_QS[ 5 ], warping_Q16 );
+         tmp1_QS[ 4 ]  = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 0 ] - tmp1_QS[ 4 ], warping_Q16 );
+
+         /* -------------------- epilog 3 -------------------- */
+
+         tmp2_QS[ 5 ]  = state_QS[ 1 ];
+         tmp2_QS[ 6 ]  = state_QS[ 2 ];
+         tmp2_QS[ 7 ]  = state_QS[ 3 ];
+
+         state_QS[ 0 ] = tmp1_QS[ 4 ];
+         state_QS[ 1 ] = tmp1_QS[ 5 ];
+         state_QS[ 2 ] = tmp1_QS[ 6 ];
+         state_QS[ 3 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         tmp1_QS[ 7 ]  = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 2 ] - tmp1_QS[ 7 ], warping_Q16 );
+         tmp1_QS[ 6 ]  = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 1 ] - tmp1_QS[ 6 ], warping_Q16 );
+         tmp1_QS[ 5 ]  = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 0 ] - tmp1_QS[ 5 ], warping_Q16 );
+
+         /* -------------------- epilog 4 -------------------- */
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+         corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+         opus_int32 tmp1_QS_2 = silk_SMLAWB( state_QS[ 1 ], tmp1_QS[ 5 ] - tmp1_QS[ 6 ], warping_Q16 );
+         state_QS[ 1 ]        = silk_SMLAWB( state_QS[ 2 ], tmp1_QS[ 6 ] - tmp1_QS[ 7 ], warping_Q16 );
+
+         /* -------------------- epilog 5 & 6 -------------------- */
+
+         state_QS[ 0 ] = silk_SMLAWB( tmp1_QS[ 6 ], tmp1_QS_2 - state_QS[ 1 ], warping_Q16 );
+         state_QS[ 2 ] = tmp1_QS[ 7 ];
+
+         corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS_2,     input_QS[ 6 ] ), 2 * QS - QC )
+               +         silk_RSHIFT64( silk_SMULL( state_QS[ 0 ], input_QS[ 7 ] ), 2 * QS - QC );
+         corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( state_QS[ 1 ], input_QS[ 7 ] ), 2 * QS - QC );
+      }
+   }
+
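+   /* Process any remaining samples one at a time, using the same allpass recursion as the reference C implementation. */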
+   for( ; n < length; n++ ) {
+      input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+      /* Loop over allpass sections */
+      for( i = 0; i <= order; i++ ) {
+         /* Output of allpass section */
+         tmp2_QS[ 0 ]           = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+         state_QS[ order - i ]  = tmp1_QS[ 0 ];
+         corr_QC[  order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+         tmp1_QS[ 0 ]          = tmp2_QS[ 0 ];
+      }
+   }
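+   /* corr_QC[ order ] holds the zero-lag (largest) term; choose lsh so it fits in 32 bits with headroom, then write the shifted correlations to corr[] in reverse order. */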
+   lsh = silk_CLZ64( corr_QC[ order ] ) - 35;
+   lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+   *scale = -( QC + lsh );
+   silk_assert( *scale >= -30 && *scale <= 12 );
+   if( lsh >= 0 ) {
+      for( i = 0; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) );
+      }
+   } else {
+      for( i = 0; i <= order; i++ ) {
+         corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) );
+      }
+   }
+   silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC */
+}
+
+#define MAX_LENGTH 360
+
+static int test_warped_autocorrelation(int arch)
+{
+   unsigned int i;
+   opus_int32 corrOrg[MAX_SHAPE_LPC_ORDER + 1], corrOpt[MAX_SHAPE_LPC_ORDER + 1];
+   opus_int   scaleOrg, scaleOpt;
+   opus_int16 input[MAX_LENGTH];
+   opus_int   warping_Q16, length, order;
+   (void)arch;
+
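+   /* Compare silk_warped_autocorrelation_FIX_c() against silk_warped_autocorrelation_FIX() (which may dispatch to the NEON version) for every even order up to MAX_SHAPE_LPC_ORDER and every length up to MAX_LENGTH, on random input and warping. */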
+   printf("%50s", "silk_warped_autocorrelation_FIX() ...");
+   for( order = 0; order <= MAX_SHAPE_LPC_ORDER; order += 2 ) /* Order must be even. */
+   {
+      for( length = 0; length <= MAX_LENGTH; length++ )
+      {
+         for (i=0;i<MAX_LENGTH;++i)
+         {
+            input[i] = (rand() % 32767) - 16384;
+         }
+         warping_Q16 = rand() % 32767;
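+         /* Start both buffers with identical contents so that entries beyond the current order compare equal in the full-array memcmp below. */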
+         memcpy(corrOpt, corrOrg, sizeof(corrOrg));
+
+         silk_warped_autocorrelation_FIX_c(corrOrg, &scaleOrg, input, warping_Q16, length, order);
+         silk_warped_autocorrelation_FIX  (corrOpt, &scaleOpt, input, warping_Q16, length, order);
+         if (memcmp(corrOpt, corrOrg, sizeof(corrOrg)))
+         {
+            printf("order=%2d length=%3d failed!\n", order, length);
+            for (i=0;i<sizeof(corrOrg) / sizeof(*corrOrg);i++)
+            {
+               if (corrOrg[i] != corrOpt[i])
+               {
+                  printf("\ncorrOrg[%3u]=%12d, corrOpt[%3u]=%12d", i, corrOrg[i], i, corrOpt[i]);
+               }
+            }
+            printf("\n");
+            return -1;
+         }
+      }
+   }
+   printf(" passed!\n");
+   return 0;
+}
+#endif /* FIXED_POINT */
diff --git a/silk_headers.mk b/silk_headers.mk
index f8bf1d2..52c42d0 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -30,6 +30,7 @@ silk/arm/SigProc_FIX_armv5e.h \
 silk/arm/NSQ_neon.h \
 silk/fixed/main_FIX.h \
 silk/fixed/structs_FIX.h \
+silk/fixed/arm/warped_autocorrelation_FIX_arm.h \
 silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
 silk/fixed/mips/prefilter_FIX_mipsr1.h \
 silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 7229ee3..5f9551b 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -117,6 +117,9 @@ SILK_SOURCES_FIXED_SSE4_1 = silk/fixed/x86/vector_ops_FIX_sse.c \
 silk/fixed/x86/burg_modified_FIX_sse.c \
 silk/fixed/x86/prefilter_FIX_sse.c
 
+SILK_SOURCES_FIXED_ARM_NEON_INTR = \
+silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
+
 SILK_SOURCES_FLOAT = \
 silk/float/apply_sine_window_FLP.c \
 silk/float/corrMatrix_FLP.c \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index 7eeab38..b5c25d9 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -40,6 +40,7 @@
 #ifdef FIXED_POINT
 
 # include "celt/tests/test_unit_optimization_lpc.c"
+# include "silk/tests/test_unit_optimization_warped_autocorrelation.c"
 
 #endif
 
@@ -56,6 +57,7 @@ int main(void)
       printf("\n--------------------------- Testing optimization ---------------------------\n");
 #ifdef FIXED_POINT
       result |= test_fir(arch);
+      result |= test_warped_autocorrelation(arch);
 #endif /* FIXED_POINT */
    }
    return result;
-- 
2.8.0.rc3.226.g39d4020


