[opus] [PATCH 4/5] Refactor silk_warped_autocorrelation_FIX_neon()

Linfeng Zhang linfengz at google.com
Thu Jul 14 00:49:01 UTC 2016


Clean up the code by defining macros.
---
 .../arm/warped_autocorrelation_FIX_neon_intr.c     | 637 ++++++++++-----------
 1 file changed, 287 insertions(+), 350 deletions(-)

diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
index 80dd949..6071445 100644
--- a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -40,7 +40,6 @@
 #endif
 
 #include <stdio.h>
-
 #include <arm_neon.h>
 #include "stack_alloc.h"
 #include "main_FIX.h"
@@ -49,6 +48,190 @@
 
 #define NUM_PARALLEL_INPUTS 8
 
+#define vget_all(x) (x)
+
+/* Calculate 1 or 2 elements of corr_QC and tmp1_QS in prolog. */
+#define CORRELATION_PROLOG_1_OR_2(                                             \
+   corr_QC,           /* I/O  corr_QC buffer. Updated 1 or 2 elements.    */   \
+   state_QS,          /* I    state_QS buffer.                            */   \
+   offset,            /* I    The address offset of corr_QC and state_QS. */   \
+   input_QS0_s32x4,   /* I    Input_QS elements 0 to 3.                   */   \
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.    */   \
+   tmp1_QS_s32x2,     /* I/O  Either 1 or 2 elements of tmp1_QS.          */   \
+   tmp2_QS_s32x2,     /* I    Either 1 or 2 elements of tmp2_QS.          */   \
+   int64xX_t,         /*      Either int64x1_t or int64x2_t.              */   \
+   vget_X,            /*      Either vget_low_s64 or vget_all.            */   \
+   vld1X_s64,         /*      Either vld1_s64 or vld1q_s64.               */   \
+   vst1X_s64,         /*      Either vst1_s64 or vst1q_s64.               */   \
+   vaddX_s64          /*      Either vadd_s64 or vaddq_s64.               */   \
+)                                                                              \
+{                                                                              \
+   int64xX_t corr_QC_s64xX;                                                    \
+   int64x2_t t_s64x2;                                                          \
+   corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset));                            \
+   t_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));    \
+   t_s64x2       = vshrq_n_s64(t_s64x2, 2 * QS - QC);                          \
+   corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t_s64x2));                \
+   (vst1X_s64)(corr_QC + (offset), corr_QC_s64xX);                             \
+   tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + (offset) - 1), tmp1_QS_s32x2); \
+   t_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2);                \
+   tmp1_QS_s32x2 = vshrn_n_s64(t_s64x2, 16);                                   \
+   tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);                     \
+}
+
+/* Calculate 3 or 4 elements of corr_QC, state_QS and tmp1_QS in prolog. */
+#define CORRELATION_PROLOG_3_OR_4(                                                            \
+   corr_QC,           /* I/O  corr_QC buffer. Updated 3 or 4 elements.    */                  \
+   state_QS,          /* I/O  state_QS buffer. Updated 4 elements.        */                  \
+   offset,            /* I    The address offset of corr_QC and state_QS. */                  \
+   input_QS0_s32x4,   /* I    Input_QS elements 0 to 3.                   */                  \
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.    */                  \
+   tmp1_QS0_s32x4,    /* O    Updated 3 or 4 elements of tmp1_QS.         */                  \
+   int64xX_t,         /*      Either int64x1_t or int64x2_t.              */                  \
+   vget_X,            /*      Either vget_low_s64 or vget_all.            */                  \
+   vld1X_s64,         /*      Either vld1_s64 or vld1q_s64.               */                  \
+   vst1X_s64,         /*      Either vst1_s64 or vst1q_s64.               */                  \
+   vaddX_s64          /*      Either vadd_s64 or vaddq_s64.               */                  \
+)                                                                                             \
+{                                                                                             \
+   int32x4_t tmp2_QS_s32x4;                                                                   \
+   int64x2_t corr_QC0_s64x2, t0_s64x2, t1_s64x2;                                              \
+   int64xX_t corr_QC_s64xX;                                                                   \
+   tmp2_QS_s32x4  = vld1q_s32(state_QS + (offset));                                           \
+   vst1q_s32(state_QS + (offset), tmp1_QS0_s32x4);                                            \
+   corr_QC0_s64x2 = vld1q_s64  (corr_QC + (offset));                                          \
+   corr_QC_s64xX  = (vld1X_s64)(corr_QC + (offset) + 2);                                      \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); \
+   t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);                                       \
+   t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);                                       \
+   corr_QC0_s64x2 = vaddq_s64  (corr_QC0_s64x2, t0_s64x2);                                    \
+   corr_QC_s64xX  = (vaddX_s64)(corr_QC_s64xX,  vget_X(t1_s64x2));                            \
+   vst1q_s64  (corr_QC + (offset),     corr_QC0_s64x2);                                       \
+   (vst1X_s64)(corr_QC + (offset) + 2, corr_QC_s64xX);                                        \
+   tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS0_s32x4);            \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32x2);              \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32x2);              \
+   tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));       \
+   tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS_s32x4);                                 \
+}
+
+/* Calculate 4 elements of corr_QC, state_QS and tmp1_QS in prolog and kernel loop. */
+#define CORRELATION_4(offset) CORRELATION_PROLOG_3_OR_4(corr_QC, state_QS, offset, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS0_s32x4, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
+
+/* Calculate 3 or 4 elements of corr_QC and tmp1_QS. */
+#define CORRELATION_NEXT_3_OR_4(                                                              \
+   corr_QC,           /* I/O  corr_QC buffer. Updated 3 or 4 elements.    */                  \
+   state_QS,          /* I    state_QS buffer.                            */                  \
+   offset,            /* I    The address offset of corr_QC and state_QS. */                  \
+   input_QS1_s32x4,   /* I    4 elements of input_QS.                     */                  \
+   tmp1_QS1_s32x4,    /* I/O  Either 3 or 4 elements of tmp1_QS.          */                  \
+   tmp2_QS1_s32x4,    /* I    Either 3 or 4 elements of tmp2_QS.          */                  \
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.    */                  \
+   int64xX_t,         /*      Either int64x1_t or int64x2_t.              */                  \
+   vget_X,            /*      Either vget_low_s64 or vget_all.            */                  \
+   vld1X_s64,         /*      Either vld1_s64 or vld1q_s64.               */                  \
+   vst1X_s64,         /*      Either vst1_s64 or vst1q_s64.               */                  \
+   vaddX_s64          /*      Either vadd_s64 or vaddq_s64.               */                  \
+)                                                                                             \
+{                                                                                             \
+   int64x2_t corr_QC0_s64x2, t0_s64x2, t1_s64x2;                                              \
+   int64xX_t corr_QC_s64xX;                                                                   \
+   corr_QC0_s64x2 = vld1q_s64  (corr_QC + (offset));                                          \
+   corr_QC_s64xX  = (vld1X_s64)(corr_QC + (offset) + 2);                                      \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); \
+   t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);                                       \
+   t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);                                       \
+   corr_QC0_s64x2 = vaddq_s64  (corr_QC0_s64x2, t0_s64x2);                                    \
+   corr_QC_s64xX  = (vaddX_s64)(corr_QC_s64xX,  vget_X(t1_s64x2));                            \
+   vst1q_s64  (corr_QC + (offset),     corr_QC0_s64x2);                                       \
+   (vst1X_s64)(corr_QC + (offset) + 2, corr_QC_s64xX);                                        \
+   tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS1_s32x4);            \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32x2);              \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32x2);              \
+   tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));       \
+   tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);                                \
+}
+
+/* Calculate 1 or 2 elements of corr_QC and tmp1_QS. */
+#define CORRELATION_EXTRA_1_OR_2(                                                                                      \
+   corr_QC,           /* I/O  corr_QC buffer. Updated 1 or 2 elements.                                              */ \
+   state_QS,          /* I    state_QS buffer.                                                                      */ \
+   offset,            /* I    The address offset of corr_QC and state_QS.                                           */ \
+   input_QS_s32x2,    /* I    2 elements of input_QS.                                                               */ \
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.                                              */ \
+   tmp1_QS_s32x2X,    /* I    Either tmp1_QS_s32x2 or high half of tmp1_QS0_s32x4, with 1 or 2 elements of tmp1_QS. */ \
+   tmp2_QS_s32x2,     /* I    Either 1 or 2 elements of tmp2_QS.                                                    */ \
+   tmp1_QS_s32x2,     /* O    Updated 1 or 2 elements of tmp1_QS.                                                   */ \
+   int64xX_t,         /*      Either int64x1_t or int64x2_t.                                                        */ \
+   vget_X,            /*      Either vget_low_s64 or vget_all.                                                      */ \
+   vld1X_s64,         /*      Either vld1_s64 or vld1q_s64.                                                         */ \
+   vst1X_s64,         /*      Either vst1_s64 or vst1q_s64.                                                         */ \
+   vaddX_s64          /*      Either vadd_s64 or vaddq_s64.                                                         */ \
+)                                                                                                                      \
+{                                                                                                                      \
+   int64xX_t corr_QC_s64xX;                                                                                            \
+   int64x2_t t_s64x2;                                                                                                  \
+   corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset));                                                                    \
+   t_s64x2       = vmull_s32(tmp1_QS_s32x2X, input_QS_s32x2);                                                          \
+   t_s64x2       = vshrq_n_s64(t_s64x2, 2 * QS - QC);                                                                  \
+   corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t_s64x2));                                                        \
+   (vst1X_s64)(corr_QC + (offset), corr_QC_s64xX);                                                                     \
+   tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + (offset) - 1), tmp1_QS_s32x2X);                                        \
+   t_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2);                                                        \
+   tmp1_QS_s32x2 = vshrn_n_s64(t_s64x2, 16);                                                                           \
+   tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);                                                             \
+}
+
+/* Calculate 1 element of corr_QC. */
+#define CORRELATION_EPILOG_1(                                                                                       \
+   corr_QC,         /* I/O  corr_QC buffer. Updated 1 element.                                                   */ \
+   input_QS0_s32x4, /* I    4 elements of input_QS.                                                              */ \
+   tmp1_QS_s32xX,   /* I    Either tmp1_QS_s32x2 or low half of tmp1_QS0_s32x4, with 1 or 2 elements of tmp1_QS. */ \
+   vget_X           /*      The splitting instruction, either vget_low_s32 or vget_high_s32.                     */ \
+)                                                                                                                   \
+{                                                                                                                   \
+   int64x1_t corr_s64x1;                                                                                            \
+   int64x2_t t_s64x2;                                                                                               \
+   corr_s64x1 = vld1_s64(corr_QC);                                                                                  \
+   t_s64x2    = vmull_s32(tmp1_QS_s32xX, (vget_X)(input_QS0_s32x4));                                                \
+   t_s64x2    = vshrq_n_s64(t_s64x2, 2 * QS - QC);                                                                  \
+   corr_s64x1 = vadd_s64(corr_s64x1, vget_high_s64(t_s64x2));                                                       \
+   vst1_s64(corr_QC, corr_s64x1);                                                                                   \
+}
+
+/* Calculate 4 elements of corr_QC, state_QS and tmp1_QS in epilog. */
+#define CORRELATION_EPILOG_4(                                                                 \
+   corr_QC,           /* I/O  corr_QC buffer. Updated 4 elements.                          */ \
+   state_QS,          /* I/O  state_QS buffer. Updated 4 elements.                         */ \
+   offset,            /* I    The address offset of corr_QC and state_QS.                  */ \
+   input_QS1_s32x4,   /* I    Input_QS elements 4 to 7.                                    */ \
+   warping_Q16_s32x2, /* I    Warping coefficient in all vector lanes.                     */ \
+   tmp1_QS1_s32x4     /* I/O  4 elements of tmp1_QS.                                       */ \
+ )                                                                                            \
+ {                                                                                            \
+   int32x4_t tmp2_QS_s32x4;                                                                   \
+   int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, t0_s64x2, t1_s64x2;                              \
+   tmp2_QS_s32x4  = vld1q_s32(state_QS + (offset));                                           \
+   vst1q_s32(state_QS + (offset), tmp1_QS1_s32x4);                                            \
+   corr_QC0_s64x2 = vld1q_s64(corr_QC + (offset));                                            \
+   corr_QC1_s64x2 = vld1q_s64(corr_QC + (offset) + 2);                                        \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); \
+   t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);                                       \
+   t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);                                       \
+   corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);                                      \
+   corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);                                      \
+   vst1q_s64(corr_QC + (offset),     corr_QC0_s64x2);                                         \
+   vst1q_s64(corr_QC + (offset) + 2, corr_QC1_s64x2);                                         \
+   tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS1_s32x4);            \
+   t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32x2);              \
+   t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32x2);              \
+   tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));       \
+   tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS_s32x4);                                 \
+}
+
 void silk_warped_autocorrelation_FIX_neon(
          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
@@ -61,9 +244,10 @@ void silk_warped_autocorrelation_FIX_neon(
    opus_int   n = 0, i, lsh;
    opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
    opus_int32 input_QS[NUM_PARALLEL_INPUTS];
-   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra entries.
-   opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop, and one extra end entry in the last prolog.
+   opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra elements.
+   opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head element in the last prolog and the last inner loop, and one extra end element in the last prolog.
    opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+   int64x2_t  lsh_s64x2;
 
    /* Order must be even */
    silk_assert( ( order & 1 ) == 0 );
@@ -71,387 +255,138 @@ void silk_warped_autocorrelation_FIX_neon(
 
    /* Loop over samples */
    if( order >= NUM_PARALLEL_INPUTS - 2 ) {
-      const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16);
+      const int32x2_t warping_Q16_s32x2 = vdup_n_s32(warping_Q16);
       for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) {
-         int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4, tmp2_QS1_s32x4;
-         int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2, corr_QC3_s64x2;
-         int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
+         int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4;
          int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2;
-         int64x1_t corr_QC_s64x1;
          const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n),     QS);
          const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4), QS);
-         vst1q_s32(tmp1_QS,      input_QS0_s32x4);
-         vst1q_s32(tmp1_QS  + 4, input_QS1_s32x4);
+         vst1q_s32(tmp1_QS,     input_QS0_s32x4);
+         vst1q_s32(tmp1_QS + 4, input_QS1_s32x4);
 
          /* Loop over allpass sections */
 
          /* -------------------- prolog 0 -------------------- */
-
-         tmp1_QS_s32x2  = vget_low_s32(input_QS0_s32x4);
-         tmp2_QS_s32x2  = vld1_s32(state_QS + order); // Accessed one extra end entry.
+         tmp1_QS_s32x2 = vget_low_s32(input_QS0_s32x4);
+         tmp2_QS_s32x2 = vld1_s32(state_QS + order); // Accessed one extra end element.
          vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
-         corr_QC_s64x1  = vld1_s64(corr_QC + order);
-         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2));
-         vst1_s64(corr_QC + order, corr_QC_s64x1);
-         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 1), tmp1_QS_s32x2);
-         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
-         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
-         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
+         CORRELATION_PROLOG_1_OR_2(corr_QC, state_QS, order - 0, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, int64x1_t, vget_low_s64, vld1_s64,  vst1_s64,  vadd_s64)
+         tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
 
          /* -------------------- prolog 1 -------------------- */
-
-         tmp2_QS_s32x2  = vld1_s32(state_QS + order - 1);
+         tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1);
          vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1);
-         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2);
-         tmp1_QS_s32x2  = vsub_s32(vld1_s32(state_QS + order - 2), tmp1_QS_s32x2);
-         t0_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t0_s64x2, 16);
-         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+         CORRELATION_PROLOG_1_OR_2(corr_QC, state_QS, order - 1, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, int64x2_t, vget_all,     vld1q_s64, vst1q_s64, vaddq_s64)
          tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
 
          /* -------------------- prolog 2 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry.
-         vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4);  // Saving one extra entry is OK.
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2);
-         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t1_s64x2));
-         vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2);
-         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS0_s32x4);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+         // Accessed one extra end element of state_QS.
+         // Saving one extra element of state_QS is OK.
+         CORRELATION_PROLOG_3_OR_4(corr_QC, state_QS, order - 2, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS0_s32x4, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
          tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3);
 
          /* -------------------- prolog 3 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3);
-         vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4);
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2);
-         vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4), tmp1_QS0_s32x4);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-         tmp1_QS_s32x2  = vget_low_s32(input_QS1_s32x4);
+         CORRELATION_4(order - 3)
+         tmp1_QS_s32x2 = vget_low_s32(input_QS1_s32x4);
 
          /* -------------------- prolog 4 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4);
-         tmp2_QS_s32x2  = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
-         vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4);
+         tmp2_QS_s32x2 = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
          vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2);
-         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32 (input_QS1_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t2_s64x2));
-         vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2);
-         vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2);
-         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5), tmp1_QS0_s32x4);
-         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 1), tmp1_QS_s32x2);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
-         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
-         tmp1_QS_s32x2  = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
+         CORRELATION_4(order - 4)
+         CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, order, vget_low_s32(input_QS1_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
+         tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
 
          /* -------------------- prolog 5 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5);
-         tmp2_QS_s32x2  = vld1_s32 (state_QS + order - 1);
-         vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4);
-         vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2);
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_low_s32 (input_QS1_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2);
-         vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2);
-         vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6), tmp1_QS0_s32x4);
-         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS + order - 2), tmp1_QS_s32x2);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
-         t2_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS_s32x2  = vshrn_n_s64(t2_s64x2, 16);
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
+         tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1);
+         vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
+         CORRELATION_4(order - 5)
+         CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, order - 1, vget_low_s32(input_QS1_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
          tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
 
          /* -------------------- prolog 6 -------------------- */
-
-         tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6);
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry.
-         vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4);
-         vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4);  // Saving one extra entry is OK.
-         corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2);
-         corr_QC_s64x1  = vld1_s64 (corr_QC + order);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_low_s64(t3_s64x2));
-         vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2);
-         vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2);
-         vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2);
-         vst1_s64 (corr_QC + order,     corr_QC_s64x1);
-         tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7), tmp1_QS0_s32x4); // Accessed one extra head entry when order is 6.
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS1_s32x4);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
-         tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end element of state_QS.
+         vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4);  // Saving one extra element of state_QS is OK.
+         // Accessed one extra head element when order is 6.
+         CORRELATION_4(order - 6)
+         CORRELATION_NEXT_3_OR_4(corr_QC, state_QS, order - 2, input_QS1_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4, warping_Q16_s32x2, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
          tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3);
 
          /* -------------------- kernel loop -------------------- */
-
          for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
-             /* Output of allpass section */
-            tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1);
+            /* Output of allpass section */
+            // Accessed one extra head element of state_QS in the last loop.
             tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5);
-            vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1, tmp1_QS0_s32x4);
             vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5, tmp1_QS1_s32x4);
-            corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1);
-            corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3);
-            corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5);
-            corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7);
-            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
-            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
-            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
-            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
-            t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-            t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-            t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
-            corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-            corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-            corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1, corr_QC0_s64x2);
-            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3, corr_QC1_s64x2);
-            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5, corr_QC2_s64x2);
-            vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7, corr_QC3_s64x2);
-            tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS),     tmp1_QS0_s32x4); // Accessed one extra head entry in the last loop.
-            tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4);
-            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
-            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
-            t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
-            t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
-            tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
-            tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
-            tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
-            tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+            CORRELATION_4(order - i - NUM_PARALLEL_INPUTS + 1)
+            CORRELATION_NEXT_3_OR_4(corr_QC, state_QS, order - i - NUM_PARALLEL_INPUTS + 5, input_QS1_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4, warping_Q16_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
          }
 
          /* -------------------- epilog 0 -------------------- */
-
-         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3);
-         vst1q_s32(state_QS - 1, tmp1_QS0_s32x4);  // Saving one extra entry is OK.
-         vst1q_s32(state_QS + 3, tmp1_QS1_s32x4);
-         corr_QC_s64x1  = vld1_s64 (corr_QC);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + 1);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + 3);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 5);
-         t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
-         t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
-         t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t0_s64x2));
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
-         vst1q_s64(corr_QC + 1, corr_QC1_s64x2);
-         vst1q_s64(corr_QC + 3, corr_QC2_s64x2);
-         vst1q_s64(corr_QC + 5, corr_QC3_s64x2);
-         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS),     vget_high_s32(tmp1_QS0_s32x4));
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4);
-         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp2_QS_s32x2 = vld1_s32(state_QS + 1);
+         vst1q_s32(state_QS - 1, tmp1_QS0_s32x4);  // Saving one extra element is OK.
+         CORRELATION_EPILOG_1(corr_QC, input_QS0_s32x4, vget_low_s32(tmp1_QS0_s32x4), vget_low_s32)
+         CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, 1, vget_high_s32(input_QS0_s32x4), warping_Q16_s32x2, vget_high_s32(tmp1_QS0_s32x4), tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
+         CORRELATION_EPILOG_4(corr_QC, state_QS, 3, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4)
 
          /* -------------------- epilog 1 -------------------- */
-
-         tmp2_QS_s32x2  = vld1_s32 (state_QS);
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2);
-         vst1_s32 (state_QS,     tmp1_QS_s32x2);
-         vst1q_s32(state_QS + 2, tmp1_QS1_s32x4);
-         corr_QC1_s64x2 = vld1q_s64(corr_QC + 0);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + 2);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 4);
-         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1q_s64(corr_QC + 0, corr_QC1_s64x2);
-         vst1q_s64(corr_QC + 2, corr_QC2_s64x2);
-         vst1q_s64(corr_QC + 4, corr_QC3_s64x2);
-         tmp1_QS_s32x2  = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); // Accessed one extra head entry.
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4);
-         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 warping_Q16_s32);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS_s32x2  = vadd_s32 (tmp1_QS_s32x2,  tmp2_QS_s32x2);
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         tmp2_QS_s32x2 = vld1_s32(state_QS);
+         vst1_s32(state_QS, tmp1_QS_s32x2);
+         // Accessed one extra head element of state_QS.
+         CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, 0, vget_high_s32(input_QS0_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
+         CORRELATION_EPILOG_4(corr_QC, state_QS, 2, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4)
 
          /* -------------------- epilog 2 -------------------- */
-
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1);
-         vst1_lane_s32(state_QS,     tmp1_QS_s32x2, 1);
-         vst1q_s32    (state_QS + 1, tmp1_QS1_s32x4);
-         corr_QC_s64x1  = vld1_s64(corr_QC);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC + 1);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 3);
-         t1_s64x2       = vmull_s32(tmp1_QS_s32x2,                 vget_high_s32(input_QS0_s32x4));
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
-         t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t1_s64x2));
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1_s64 (corr_QC + 0, corr_QC_s64x1);
-         vst1q_s64(corr_QC + 1, corr_QC2_s64x2);
-         vst1q_s64(corr_QC + 3, corr_QC3_s64x2);
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+         vst1_lane_s32(state_QS, tmp1_QS_s32x2, 1);
+         CORRELATION_EPILOG_1(corr_QC, input_QS0_s32x4, tmp1_QS_s32x2, vget_high_s32)
+         CORRELATION_EPILOG_4(corr_QC, state_QS, 1, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4)
 
          /* -------------------- epilog 3 -------------------- */
-
-         tmp2_QS1_s32x4 = vld1q_s32(state_QS);
-         vst1q_s32(state_QS, tmp1_QS1_s32x4);
-         corr_QC2_s64x2 = vld1q_s64(corr_QC);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 2);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1q_s64(corr_QC,     corr_QC2_s64x2);
-         vst1q_s64(corr_QC + 2, corr_QC3_s64x2);
-         tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4); // Accessed one extra head entry.
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
-         tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
-         tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
-
-         /* -------------------- epilog 4 -------------------- */
-
-         corr_QC_s64x1  = vld1_s64 (corr_QC);
-         corr_QC3_s64x2 = vld1q_s64(corr_QC + 1);
-         t2_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
-         t3_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
-         t2_s64x2       = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1,  vget_high_s64(t2_s64x2));
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1_s64 (corr_QC,     corr_QC_s64x1);
-         vst1q_s64(corr_QC + 1, corr_QC3_s64x2);
-         vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
-
-         tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
-         tmp1_QS_s32x2  = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4));
-         t3_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
-         tmp1_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
-         tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
-         vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
-
-         /* -------------------- epilog 5 & 6 -------------------- */
-
-         vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
-         tmp2_QS_s32x2  = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
-         t3_s64x2       = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32);
-         tmp2_QS_s32x2  = vshrn_n_s64(t3_s64x2, 16);
-         tmp2_QS_s32x2  = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2);
-         vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
-
-         corr_QC3_s64x2 = vld1q_s64(corr_QC);
-         t3_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
-         vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2));
-         t3_s64x2       = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32)));
-         t3_s64x2       = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
-         corr_QC_s64x1  = vadd_s64(vget_low_s64(corr_QC3_s64x2), vget_low_s64(t3_s64x2));
-         vst1_s64(corr_QC, corr_QC_s64x1);
+         // Accessed one extra head element of state_QS.
+         CORRELATION_EPILOG_4(corr_QC, state_QS, 0, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4)
+
+         {
+            int64x1_t corr_QC_s64x1;
+            int64x2_t corr_QC0_s64x2;
+            int64x2_t t0_s64x2, t1_s64x2;
+
+            /* -------------------- epilog 4 -------------------- */
+            corr_QC_s64x1  = vld1_s64 (corr_QC);
+            corr_QC0_s64x2 = vld1q_s64(corr_QC + 1);
+            t0_s64x2       = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+            t1_s64x2       = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+            t0_s64x2       = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            corr_QC_s64x1  = vadd_s64 (corr_QC_s64x1, vget_high_s64(t0_s64x2));
+            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t1_s64x2);
+            vst1_s64 (corr_QC,     corr_QC_s64x1);
+            vst1q_s64(corr_QC + 1, corr_QC0_s64x2);
+            vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
+
+            tmp2_QS_s32x2  = vld1_s32(state_QS + 1);
+            tmp1_QS_s32x2  = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4));
+            t1_s64x2       = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2);
+            tmp1_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+            tmp1_QS_s32x2  = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+            vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
+
+            /* -------------------- epilog 5 & 6 -------------------- */
+            vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
+            tmp2_QS_s32x2  = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
+            t1_s64x2       = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32x2);
+            tmp2_QS_s32x2  = vshrn_n_s64(t1_s64x2, 16);
+            tmp2_QS_s32x2  = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2);
+            vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
+
+            corr_QC0_s64x2 = vld1q_s64(corr_QC);
+            t1_s64x2       = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t1_s64x2);
+            vst1_s64(corr_QC + 1, vget_high_s64(corr_QC0_s64x2));
+            t1_s64x2       = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32)));
+            t1_s64x2       = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+            corr_QC_s64x1  = vadd_s64(vget_low_s64(corr_QC0_s64x2), vget_low_s64(t1_s64x2));
+            vst1_s64(corr_QC, corr_QC_s64x1);
+         }
       }
    }
 
@@ -470,14 +405,16 @@ void silk_warped_autocorrelation_FIX_neon(
    lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
    *scale = -( QC + lsh );
    silk_assert( *scale >= -30 && *scale <= 12 );
-   const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh);
+   lsh_s64x2 = vdupq_n_s64(lsh);
    for( i = 0; i <= order - 3; i += 4 ) {
-      int64x2_t corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
-      int64x2_t corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
-      corr_QC0_s64x2           = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
-      corr_QC1_s64x2           = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
-      int32x4_t corr_s32x4     = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2));
-      corr_s32x4               = vrev64q_s32(corr_s32x4);
+      int32x4_t corr_s32x4;
+      int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
+      corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
+      corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
+      corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
+      corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
+      corr_s32x4     = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2));
+      corr_s32x4     = vrev64q_s32(corr_s32x4);
       vst1q_s32(corr + order - i - 3, corr_s32x4);
    }
    if( lsh >= 0 ) {
-- 
2.8.0.rc3.226.g39d4020


