[opus] [PATCH 4/5] Refactor silk_warped_autocorrelation_FIX_neon()
Linfeng Zhang
linfengz at google.com
Thu Jul 14 00:49:01 UTC 2016
Clean up the code by factoring the repeated correlation steps into parameterized macros.
---
.../arm/warped_autocorrelation_FIX_neon_intr.c | 637 ++++++++++-----------
1 file changed, 287 insertions(+), 350 deletions(-)
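Note (not part of the patch): the new CORRELATION_* macros are parameterized over the width of the 64-bit NEON accumulator, so a single body covers both the 1- or 2-element tails (int64x1_t with vld1_s64/vadd_s64/vst1_s64) and the full 2-element vectors (int64x2_t with vld1q_s64/vaddq_s64/vst1q_s64); vget_all() is an identity macro standing in for vget_low_s64() when the whole product vector is kept. Below is a minimal, self-contained sketch of the same technique, independent of this file; the names ACCUMULATE_TAIL and accumulate_products are illustrative only and do not appear in the patch.

    #include <arm_neon.h>
    #include <stdint.h>

    /* Identity helper used when the whole 128-bit product vector is kept. */
    #define vget_all(x) (x)

    /* One macro body, instantiated for either a 1-lane or a 2-lane 64-bit
     * accumulator: the vector type and the matching load/add/store
     * intrinsics are passed in as macro arguments. */
    #define ACCUMULATE_TAIL(acc, prod_s64x2, int64xX_t, vget_X, vld1X_s64, vst1X_s64, vaddX_s64) \
    {                                                                \
        int64xX_t acc_s64xX = (vld1X_s64)(acc);                      \
        acc_s64xX = (vaddX_s64)(acc_s64xX, vget_X(prod_s64x2));      \
        (vst1X_s64)(acc, acc_s64xX);                                 \
    }

    void accumulate_products(int64_t *acc, int32x2_t a, int32x2_t b, int keep_one_lane)
    {
        const int64x2_t prod = vmull_s32(a, b);   /* two 32x32 -> 64-bit products */
        if (keep_one_lane) {
            /* 1-element tail: int64x1_t variant keeps only the low product. */
            ACCUMULATE_TAIL(acc, prod, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
        } else {
            /* Full vector: int64x2_t variant keeps both products. */
            ACCUMULATE_TAIL(acc, prod, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
        }
    }

The same pattern, applied to the much larger correlation bodies below, is what lets each prolog/epilog step collapse into a single macro invocation.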
diff --git a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
index 80dd949..6071445 100644
--- a/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
+++ b/silk/fixed/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -40,7 +40,6 @@
#endif
#include <stdio.h>
-
#include <arm_neon.h>
#include "stack_alloc.h"
#include "main_FIX.h"
@@ -49,6 +48,190 @@
#define NUM_PARALLEL_INPUTS 8
+#define vget_all(x) (x)
+
+/* Calculate 1 or 2 elements of corr_QC and tmp1_QS in prolog. */
+#define CORRELATION_PROLOG_1_OR_2( \
+ corr_QC, /* I/O corr_QC buffer. Updated 1 or 2 elements. */ \
+ state_QS, /* I state_QS buffer. */ \
+ offset, /* I The address offset of corr_QC and state_QS. */ \
+ input_QS0_s32x4, /* I Input_QS elements 0 to 3. */ \
+ warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \
+ tmp1_QS_s32x2, /* I/O Either 1 or 2 elements of tmp1_QS. */ \
+ tmp2_QS_s32x2, /* I Either 1 or 2 elements of tmp2_QS. */ \
+ int64xX_t, /* Either int64x1_t or int64x2_t. */ \
+ vget_X, /* Either vget_low_s64 or vget_all. */ \
+ vld1X_s64, /* Either vld1_s64 or vld1q_s64. */ \
+ vst1X_s64, /* Either vst1_s64 or vst1q_s64. */ \
+ vaddX_s64 /* Either vadd_s64 or vaddq_s64. */ \
+) \
+{ \
+ int64xX_t corr_QC_s64xX; \
+ int64x2_t t_s64x2; \
+ corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset)); \
+ t_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4)); \
+ t_s64x2 = vshrq_n_s64(t_s64x2, 2 * QS - QC); \
+ corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t_s64x2)); \
+ (vst1X_s64)(corr_QC + (offset), corr_QC_s64xX); \
+ tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + (offset) - 1), tmp1_QS_s32x2); \
+ t_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2); \
+ tmp1_QS_s32x2 = vshrn_n_s64(t_s64x2, 16); \
+ tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); \
+}
+
+/* Calculate 3 or 4 elements of corr_QC, state_QS and tmp1_QS in prolog. */
+#define CORRELATION_PROLOG_3_OR_4( \
+ corr_QC, /* I/O corr_QC buffer. Updated 3 or 4 elements. */ \
+ state_QS, /* I/O state_QS buffer. Updated 4 elements. */ \
+ offset, /* I The address offset of corr_QC and state_QS. */ \
+ input_QS0_s32x4, /* I Input_QS elements 0 to 3. */ \
+ warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \
+ tmp1_QS0_s32x4, /* I/O Updated 3 or 4 elements of tmp1_QS. */ \
+ int64xX_t, /* Either int64x1_t or int64x2_t. */ \
+ vget_X, /* Either vget_low_s64 or vget_all. */ \
+ vld1X_s64, /* Either vld1_s64 or vld1q_s64. */ \
+ vst1X_s64, /* Either vst1_s64 or vst1q_s64. */ \
+ vaddX_s64 /* Either vadd_s64 or vaddq_s64. */ \
+) \
+{ \
+ int32x4_t tmp2_QS_s32x4; \
+ int64x2_t corr_QC0_s64x2, t0_s64x2, t1_s64x2; \
+ int64xX_t corr_QC_s64xX; \
+ tmp2_QS_s32x4 = vld1q_s32(state_QS + (offset)); \
+ vst1q_s32(state_QS + (offset), tmp1_QS0_s32x4); \
+ corr_QC0_s64x2 = vld1q_s64 (corr_QC + (offset)); \
+ corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset) + 2); \
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4)); \
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4)); \
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); \
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); \
+ corr_QC0_s64x2 = vaddq_s64 (corr_QC0_s64x2, t0_s64x2); \
+ corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t1_s64x2)); \
+ vst1q_s64 (corr_QC + (offset), corr_QC0_s64x2); \
+ (vst1X_s64)(corr_QC + (offset) + 2, corr_QC_s64xX); \
+ tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS0_s32x4); \
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32x2); \
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32x2); \
+ tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); \
+ tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS_s32x4); \
+}
+
+/* Calculate 4 elements of corr_QC, state_QS and tmp1_QS in prolog and kernel loop. */
+#define CORRELATION_4(offset) CORRELATION_PROLOG_3_OR_4(corr_QC, state_QS, offset, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS0_s32x4, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
+
+/* Calculate 3 or 4 elements of corr_QC and tmp1_QS. */
+#define CORRELATION_NEXT_3_OR_4( \
+ corr_QC, /* I/O corr_QC buffer. Updated 3 or 4 elements. */ \
+ state_QS, /* I state_QS buffer. */ \
+ offset, /* I The address offset of corr_QC and state_QS. */ \
+ input_QS1_s32x4, /* I 4 elements of input_QS. */ \
+ tmp1_QS1_s32x4, /* I/O Either 3 or 4 elements of tmp1_QS. */ \
+ tmp2_QS1_s32x4, /* I Either 3 or 4 elements of tmp2_QS. */ \
+ warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \
+ int64xX_t, /* Either int64x1_t or int64x2_t. */ \
+ vget_X, /* Either vget_low_s64 or vget_all. */ \
+ vld1X_s64, /* Either vld1_s64 or vld1q_s64. */ \
+ vst1X_s64, /* Either vst1_s64 or vst1q_s64. */ \
+ vaddX_s64 /* Either vadd_s64 or vaddq_s64. */ \
+) \
+{ \
+ int64x2_t corr_QC0_s64x2, t0_s64x2, t1_s64x2; \
+ int64xX_t corr_QC_s64xX; \
+ corr_QC0_s64x2 = vld1q_s64 (corr_QC + (offset)); \
+ corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset) + 2); \
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); \
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); \
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); \
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); \
+ corr_QC0_s64x2 = vaddq_s64 (corr_QC0_s64x2, t0_s64x2); \
+ corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t1_s64x2)); \
+ vst1q_s64 (corr_QC + (offset), corr_QC0_s64x2); \
+ (vst1X_s64)(corr_QC + (offset) + 2, corr_QC_s64xX); \
+ tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS1_s32x4); \
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32x2); \
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32x2); \
+ tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); \
+ tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4); \
+}
+
+/* Calculate 1 or 2 elements of corr_QC and tmp1_QS. */
+#define CORRELATION_EXTRA_1_OR_2( \
+ corr_QC, /* I/O corr_QC buffer. Updated 1 or 2 elements. */ \
+ state_QS, /* I state_QS buffer. */ \
+ offset, /* I The address offset of corr_QC and state_QS. */ \
+ input_QS_s32x2, /* I 2 elements of input_QS. */ \
+ warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \
+ tmp1_QS_s32x2X, /* I Either tmp1_QS_s32x2 or high half of tmp1_QS0_s32x4, with 1 or 2 elements of tmp1_QS. */ \
+ tmp2_QS_s32x2, /* I Either 1 or 2 elements of tmp2_QS. */ \
+ tmp1_QS_s32x2, /* O Updated 1 or 2 elements of tmp1_QS. */ \
+ int64xX_t, /* Either int64x1_t or int64x2_t. */ \
+ vget_X, /* Either vget_low_s64 or vget_all. */ \
+ vld1X_s64, /* Either vld1_s64 or vld1q_s64. */ \
+ vst1X_s64, /* Either vst1_s64 or vst1q_s64. */ \
+ vaddX_s64 /* Either vadd_s64 or vaddq_s64. */ \
+) \
+{ \
+ int64xX_t corr_QC_s64xX; \
+ int64x2_t t_s64x2; \
+ corr_QC_s64xX = (vld1X_s64)(corr_QC + (offset)); \
+ t_s64x2 = vmull_s32(tmp1_QS_s32x2X, input_QS_s32x2); \
+ t_s64x2 = vshrq_n_s64(t_s64x2, 2 * QS - QC); \
+ corr_QC_s64xX = (vaddX_s64)(corr_QC_s64xX, vget_X(t_s64x2)); \
+ (vst1X_s64)(corr_QC + (offset), corr_QC_s64xX); \
+ tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + (offset) - 1), tmp1_QS_s32x2X); \
+ t_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2); \
+ tmp1_QS_s32x2 = vshrn_n_s64(t_s64x2, 16); \
+ tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2); \
+}
+
+/* Calculate 1 element of corr_QC. */
+#define CORRELATION_EPILOG_1( \
+ corr_QC, /* I/O corr_QC buffer. Updated 1 element. */ \
+ input_QS0_s32x4, /* I 4 elements of input_QS. */ \
+ tmp1_QS_s32xX, /* I Either tmp1_QS_s32x2 or low half of tmp1_QS0_s32x4, with 1 or 2 elements of tmp1_QS. */ \
+ vget_X /* The splitting instruction, either vget_low_s32 or vget_high_s32. */ \
+) \
+{ \
+ int64x1_t corr_s64x1; \
+ int64x2_t t_s64x2; \
+ corr_s64x1 = vld1_s64(corr_QC); \
+ t_s64x2 = vmull_s32(tmp1_QS_s32xX, (vget_X)(input_QS0_s32x4)); \
+ t_s64x2 = vshrq_n_s64(t_s64x2, 2 * QS - QC); \
+ corr_s64x1 = vadd_s64(corr_s64x1, vget_high_s64(t_s64x2)); \
+ vst1_s64(corr_QC, corr_s64x1); \
+}
+
+/* Calculate 4 elements of corr_QC, state_QS and tmp1_QS in epilog. */
+#define CORRELATION_EPILOG_4( \
+ corr_QC, /* I/O corr_QC buffer. Updated 4 elements. */ \
+ state_QS, /* I/O state_QS buffer. Updated 4 elements. */ \
+ offset, /* I The address offset of corr_QC and state_QS. */ \
+ input_QS1_s32x4, /* I Input_QS elements 4 to 7. */ \
+ warping_Q16_s32x2, /* I Warping coefficient in all vector lanes. */ \
+ tmp1_QS1_s32x4 /* I/O 4 elements of tmp1_QS. */ \
+ ) \
+ { \
+ int32x4_t tmp2_QS_s32x4; \
+ int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, t0_s64x2, t1_s64x2; \
+ tmp2_QS_s32x4 = vld1q_s32(state_QS + (offset)); \
+ vst1q_s32(state_QS + (offset), tmp1_QS1_s32x4); \
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + (offset)); \
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + (offset) + 2); \
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4)); \
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4)); \
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC); \
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC); \
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2); \
+ corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2); \
+ vst1q_s64(corr_QC + (offset), corr_QC0_s64x2); \
+ vst1q_s64(corr_QC + (offset) + 2, corr_QC1_s64x2); \
+ tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + (offset) - 1), tmp1_QS1_s32x4); \
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32x2); \
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32x2); \
+ tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16)); \
+ tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS_s32x4); \
+}
+
void silk_warped_autocorrelation_FIX_neon(
opus_int32 *corr, /* O Result [order + 1] */
opus_int *scale, /* O Scaling of the correlation vector */
@@ -61,9 +244,10 @@ void silk_warped_autocorrelation_FIX_neon(
opus_int n = 0, i, lsh;
opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
opus_int32 input_QS[NUM_PARALLEL_INPUTS];
- opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra entries.
- opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop, and one extra end entry in the last prolog.
+ opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra elements.
+ opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head element in the last prolog and the last inner loop, and one extra end element in the last prolog.
opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+ int64x2_t lsh_s64x2;
/* Order must be even */
silk_assert( ( order & 1 ) == 0 );
@@ -71,387 +255,138 @@ void silk_warped_autocorrelation_FIX_neon(
/* Loop over samples */
if( order >= NUM_PARALLEL_INPUTS - 2 ) {
- const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16);
+ const int32x2_t warping_Q16_s32x2 = vdup_n_s32(warping_Q16);
for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) {
- int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4, tmp2_QS1_s32x4;
- int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2, corr_QC3_s64x2;
- int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
+ int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4;
int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2;
- int64x1_t corr_QC_s64x1;
const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n), QS);
const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4), QS);
- vst1q_s32(tmp1_QS, input_QS0_s32x4);
- vst1q_s32(tmp1_QS + 4, input_QS1_s32x4);
+ vst1q_s32(tmp1_QS, input_QS0_s32x4);
+ vst1q_s32(tmp1_QS + 4, input_QS1_s32x4);
/* Loop over allpass sections */
/* -------------------- prolog 0 -------------------- */
-
- tmp1_QS_s32x2 = vget_low_s32(input_QS0_s32x4);
- tmp2_QS_s32x2 = vld1_s32(state_QS + order); // Accessed one extra end entry.
+ tmp1_QS_s32x2 = vget_low_s32(input_QS0_s32x4);
+ tmp2_QS_s32x2 = vld1_s32(state_QS + order); // Accessed one extra end element.
vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
- corr_QC_s64x1 = vld1_s64(corr_QC + order);
- t0_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- corr_QC_s64x1 = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2));
- vst1_s64(corr_QC + order, corr_QC_s64x1);
- tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + order - 1), tmp1_QS_s32x2);
- t0_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
- tmp1_QS_s32x2 = vshrn_n_s64(t0_s64x2, 16);
- tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
- tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
+ CORRELATION_PROLOG_1_OR_2(corr_QC, state_QS, order - 0, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
+ tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
/* -------------------- prolog 1 -------------------- */
-
- tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1);
+ tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1);
vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
- corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1);
- t0_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
- vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2);
- tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + order - 2), tmp1_QS_s32x2);
- t0_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
- tmp1_QS_s32x2 = vshrn_n_s64(t0_s64x2, 16);
- tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+ CORRELATION_PROLOG_1_OR_2(corr_QC, state_QS, order - 1, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
/* -------------------- prolog 2 -------------------- */
-
- tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry.
- vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4); // Saving one extra entry is OK.
- corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2);
- corr_QC_s64x1 = vld1_s64 (corr_QC + order);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
- corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t1_s64x2));
- vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2);
- vst1_s64 (corr_QC + order, corr_QC_s64x1);
- tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS0_s32x4);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
- tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
- tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+ // Accessed one extra end element of state_QS.
+ // Saving one extra element of state_QS is OK.
+ CORRELATION_PROLOG_3_OR_4(corr_QC, state_QS, order - 2, input_QS0_s32x4, warping_Q16_s32x2, tmp1_QS0_s32x4, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3);
/* -------------------- prolog 3 -------------------- */
-
- tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3);
- vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4);
- corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3);
- corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
- corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
- vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2);
- vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2);
- tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4), tmp1_QS0_s32x4);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
- tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
- tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
- tmp1_QS_s32x2 = vget_low_s32(input_QS1_s32x4);
+ CORRELATION_4(order - 3)
+ tmp1_QS_s32x2 = vget_low_s32(input_QS1_s32x4);
/* -------------------- prolog 4 -------------------- */
-
- tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4);
- tmp2_QS_s32x2 = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
- vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4);
+ tmp2_QS_s32x2 = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
- corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4);
- corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2);
- corr_QC_s64x1 = vld1_s64 (corr_QC + order);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
- t2_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32 (input_QS1_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
- corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
- corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t2_s64x2));
- vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2);
- vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2);
- vst1_s64 (corr_QC + order, corr_QC_s64x1);
- tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5), tmp1_QS0_s32x4);
- tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS + order - 1), tmp1_QS_s32x2);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
- t2_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
- tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
- tmp1_QS_s32x2 = vshrn_n_s64(t2_s64x2, 16);
- tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
- tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2);
- tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
+ CORRELATION_4(order - 4)
+ CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, order, vget_low_s32(input_QS1_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
+ tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
/* -------------------- prolog 5 -------------------- */
-
- tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5);
- tmp2_QS_s32x2 = vld1_s32 (state_QS + order - 1);
- vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4);
- vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2);
- corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5);
- corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3);
- corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
- t2_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32 (input_QS1_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
- corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
- corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
- vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2);
- vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2);
- vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2);
- tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6), tmp1_QS0_s32x4);
- tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS + order - 2), tmp1_QS_s32x2);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
- t2_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
- tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
- tmp1_QS_s32x2 = vshrn_n_s64(t2_s64x2, 16);
- tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
- tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2);
+ tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1);
+ vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
+ CORRELATION_4(order - 5)
+ CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, order - 1, vget_low_s32(input_QS1_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
/* -------------------- prolog 6 -------------------- */
-
- tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6);
- tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry.
- vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4);
- vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4); // Saving one extra entry is OK.
- corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6);
- corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4);
- corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2);
- corr_QC_s64x1 = vld1_s64 (corr_QC + order);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
- corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
- corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
- corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t3_s64x2));
- vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2);
- vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2);
- vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2);
- vst1_s64 (corr_QC + order, corr_QC_s64x1);
- tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7), tmp1_QS0_s32x4); // Accessed one extra head entry when order is 6.
- tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS1_s32x4);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
- tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
- tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
- tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
- tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+ tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end element of state_QS.
+ vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4); // Saving one extra element of state_QS is OK.
+ // Accessed one extra head element when order is 6.
+ CORRELATION_4(order - 6)
+ CORRELATION_NEXT_3_OR_4(corr_QC, state_QS, order - 2, input_QS1_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4, warping_Q16_s32x2, int64x1_t, vget_low_s64, vld1_s64, vst1_s64, vadd_s64)
tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3);
/* -------------------- kernel loop -------------------- */
-
for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
- /* Output of allpass section */
- tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1);
+ /* Output of allpass section */
+ // Accessed one extra head element of state_QS in the last loop.
tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5);
- vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1, tmp1_QS0_s32x4);
vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5, tmp1_QS1_s32x4);
- corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1);
- corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3);
- corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5);
- corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
- corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
- corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
- corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
- vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1, corr_QC0_s64x2);
- vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3, corr_QC1_s64x2);
- vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5, corr_QC2_s64x2);
- vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7, corr_QC3_s64x2);
- tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS), tmp1_QS0_s32x4); // Accessed one extra head entry in the last loop.
- tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
- tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
- tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
- tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
- tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+ CORRELATION_4(order - i - NUM_PARALLEL_INPUTS + 1)
+ CORRELATION_NEXT_3_OR_4(corr_QC, state_QS, order - i - NUM_PARALLEL_INPUTS + 5, input_QS1_s32x4, tmp1_QS1_s32x4, tmp2_QS1_s32x4, warping_Q16_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
}
/* -------------------- epilog 0 -------------------- */
-
- tmp2_QS_s32x2 = vld1_s32(state_QS + 1);
- tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3);
- vst1q_s32(state_QS - 1, tmp1_QS0_s32x4); // Saving one extra entry is OK.
- vst1q_s32(state_QS + 3, tmp1_QS1_s32x4);
- corr_QC_s64x1 = vld1_s64 (corr_QC);
- corr_QC1_s64x2 = vld1q_s64(corr_QC + 1);
- corr_QC2_s64x2 = vld1q_s64(corr_QC + 3);
- corr_QC3_s64x2 = vld1q_s64(corr_QC + 5);
- t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
- t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
- t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t0_s64x2));
- corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
- corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
- corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
- vst1_s64 (corr_QC + 0, corr_QC_s64x1);
- vst1q_s64(corr_QC + 1, corr_QC1_s64x2);
- vst1q_s64(corr_QC + 3, corr_QC2_s64x2);
- vst1q_s64(corr_QC + 5, corr_QC3_s64x2);
- tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS), vget_high_s32(tmp1_QS0_s32x4));
- tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4);
- t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
- tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16);
- tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
- tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2);
- tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+ tmp2_QS_s32x2 = vld1_s32(state_QS + 1);
+ vst1q_s32(state_QS - 1, tmp1_QS0_s32x4); // Saving one extra element is OK.
+ CORRELATION_EPILOG_1(corr_QC, input_QS0_s32x4, vget_low_s32(tmp1_QS0_s32x4), vget_low_s32)
+ CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, 1, vget_high_s32(input_QS0_s32x4), warping_Q16_s32x2, vget_high_s32(tmp1_QS0_s32x4), tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
+ CORRELATION_EPILOG_4(corr_QC, state_QS, 3, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4)
/* -------------------- epilog 1 -------------------- */
-
- tmp2_QS_s32x2 = vld1_s32 (state_QS);
- tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2);
- vst1_s32 (state_QS, tmp1_QS_s32x2);
- vst1q_s32(state_QS + 2, tmp1_QS1_s32x4);
- corr_QC1_s64x2 = vld1q_s64(corr_QC + 0);
- corr_QC2_s64x2 = vld1q_s64(corr_QC + 2);
- corr_QC3_s64x2 = vld1q_s64(corr_QC + 4);
- t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
- corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
- corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
- vst1q_s64(corr_QC + 0, corr_QC1_s64x2);
- vst1q_s64(corr_QC + 2, corr_QC2_s64x2);
- vst1q_s64(corr_QC + 4, corr_QC3_s64x2);
- tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); // Accessed one extra head entry.
- tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4);
- t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
- tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16);
- tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
- tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2);
- tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+ tmp2_QS_s32x2 = vld1_s32(state_QS);
+ vst1_s32(state_QS, tmp1_QS_s32x2);
+ // Accessed one extra head element of state_QS.
+ CORRELATION_EXTRA_1_OR_2(corr_QC, state_QS, 0, vget_high_s32(input_QS0_s32x4), warping_Q16_s32x2, tmp1_QS_s32x2, tmp2_QS_s32x2, tmp1_QS_s32x2, int64x2_t, vget_all, vld1q_s64, vst1q_s64, vaddq_s64)
+ CORRELATION_EPILOG_4(corr_QC, state_QS, 2, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4)
/* -------------------- epilog 2 -------------------- */
-
- tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1);
- vst1_lane_s32(state_QS, tmp1_QS_s32x2, 1);
- vst1q_s32 (state_QS + 1, tmp1_QS1_s32x4);
- corr_QC_s64x1 = vld1_s64(corr_QC);
- corr_QC2_s64x2 = vld1q_s64(corr_QC + 1);
- corr_QC3_s64x2 = vld1q_s64(corr_QC + 3);
- t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
- t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t1_s64x2));
- corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
- corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
- vst1_s64 (corr_QC + 0, corr_QC_s64x1);
- vst1q_s64(corr_QC + 1, corr_QC2_s64x2);
- vst1q_s64(corr_QC + 3, corr_QC3_s64x2);
- tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4);
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
- tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
- tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+ vst1_lane_s32(state_QS, tmp1_QS_s32x2, 1);
+ CORRELATION_EPILOG_1(corr_QC, input_QS0_s32x4, tmp1_QS_s32x2, vget_high_s32)
+ CORRELATION_EPILOG_4(corr_QC, state_QS, 1, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4)
/* -------------------- epilog 3 -------------------- */
-
- tmp2_QS1_s32x4 = vld1q_s32(state_QS);
- vst1q_s32(state_QS, tmp1_QS1_s32x4);
- corr_QC2_s64x2 = vld1q_s64(corr_QC);
- corr_QC3_s64x2 = vld1q_s64(corr_QC + 2);
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
- corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
- vst1q_s64(corr_QC, corr_QC2_s64x2);
- vst1q_s64(corr_QC + 2, corr_QC3_s64x2);
- tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4); // Accessed one extra head entry.
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
- tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
- tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
-
- /* -------------------- epilog 4 -------------------- */
-
- corr_QC_s64x1 = vld1_s64 (corr_QC);
- corr_QC3_s64x2 = vld1q_s64(corr_QC + 1);
- t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
- t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
- t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t2_s64x2));
- corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
- vst1_s64 (corr_QC, corr_QC_s64x1);
- vst1q_s64(corr_QC + 1, corr_QC3_s64x2);
- vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
-
- tmp2_QS_s32x2 = vld1_s32(state_QS + 1);
- tmp1_QS_s32x2 = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4));
- t3_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
- tmp1_QS_s32x2 = vshrn_n_s64(t3_s64x2, 16);
- tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
- vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
-
- /* -------------------- epilog 5 & 6 -------------------- */
-
- vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
- tmp2_QS_s32x2 = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
- t3_s64x2 = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32);
- tmp2_QS_s32x2 = vshrn_n_s64(t3_s64x2, 16);
- tmp2_QS_s32x2 = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2);
- vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
-
- corr_QC3_s64x2 = vld1q_s64(corr_QC);
- t3_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
- vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2));
- t3_s64x2 = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32)));
- t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
- corr_QC_s64x1 = vadd_s64(vget_low_s64(corr_QC3_s64x2), vget_low_s64(t3_s64x2));
- vst1_s64(corr_QC, corr_QC_s64x1);
+ // Accessed one extra head element of state_QS.
+ CORRELATION_EPILOG_4(corr_QC, state_QS, 0, input_QS1_s32x4, warping_Q16_s32x2, tmp1_QS1_s32x4)
+
+ {
+ int64x1_t corr_QC_s64x1;
+ int64x2_t corr_QC0_s64x2;
+ int64x2_t t0_s64x2, t1_s64x2;
+
+ /* -------------------- epilog 4 -------------------- */
+ corr_QC_s64x1 = vld1_s64 (corr_QC);
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + 1);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t0_s64x2));
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t1_s64x2);
+ vst1_s64 (corr_QC, corr_QC_s64x1);
+ vst1q_s64(corr_QC + 1, corr_QC0_s64x2);
+ vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
+
+ tmp2_QS_s32x2 = vld1_s32(state_QS + 1);
+ tmp1_QS_s32x2 = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4));
+ t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32x2);
+ tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16);
+ tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+ vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
+
+ /* -------------------- epilog 5 & 6 -------------------- */
+ vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
+ tmp2_QS_s32x2 = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
+ t1_s64x2 = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32x2);
+ tmp2_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16);
+ tmp2_QS_s32x2 = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2);
+ vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
+
+ corr_QC0_s64x2 = vld1q_s64(corr_QC);
+ t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t1_s64x2);
+ vst1_s64(corr_QC + 1, vget_high_s64(corr_QC0_s64x2));
+ t1_s64x2 = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32)));
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ corr_QC_s64x1 = vadd_s64(vget_low_s64(corr_QC0_s64x2), vget_low_s64(t1_s64x2));
+ vst1_s64(corr_QC, corr_QC_s64x1);
+ }
}
}
@@ -470,14 +405,16 @@ void silk_warped_autocorrelation_FIX_neon(
lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
*scale = -( QC + lsh );
silk_assert( *scale >= -30 && *scale <= 12 );
- const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh);
+ lsh_s64x2 = vdupq_n_s64(lsh);
for( i = 0; i <= order - 3; i += 4 ) {
- int64x2_t corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
- int64x2_t corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
- corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
- corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
- int32x4_t corr_s32x4 = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2));
- corr_s32x4 = vrev64q_s32(corr_s32x4);
+ int32x4_t corr_s32x4;
+ int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
+ corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
+ corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
+ corr_s32x4 = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2));
+ corr_s32x4 = vrev64q_s32(corr_s32x4);
vst1q_s32(corr + order - i - 3, corr_s32x4);
}
if( lsh >= 0 ) {
--
2.8.0.rc3.226.g39d4020