[opus] [PATCH] Optimize silk_warped_autocorrelation_FIX() for ARM NEON
Linfeng Zhang
linfengz at google.com
Fri Jul 1 16:25:57 UTC 2016
Create silk_warped_autocorrelation_FIX_c_opt(), which unrolls the input loop and
processes 8 input samples in parallel. It has a very long prolog and epilog, but
that is the cost of getting good speed on this heavily hit function.
This function can serve as the code base for optimizations on other CPUs.
Create ARM NEON intrinsics optimization silk_warped_autocorrelation_FIX_neon().
Create tests/test_unit_optimization to unit-test the optimizations.
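
For readers new to the technique, here is a minimal, hypothetical C sketch (not
part of the patch; toy_accumulate() and its computation are illustrative only)
of unrolling an input loop by 8 with a scalar epilog for the leftover samples.
The real silk_warped_autocorrelation_FIX_c_opt() below additionally staggers
the 8 lanes through the allpass state array, which is why it also needs the
long prolog mentioned above.

    #include <stddef.h>

    /* Toy example: accumulate squared samples, 8 per outer-loop iteration. */
    static long long toy_accumulate(const short *input, size_t length)
    {
        long long acc = 0;
        size_t n = 0;

        /* Main loop: process 8 input samples per iteration. */
        for (; n + 8 <= length; n += 8) {
            acc += (long long)input[n + 0] * input[n + 0];
            acc += (long long)input[n + 1] * input[n + 1];
            acc += (long long)input[n + 2] * input[n + 2];
            acc += (long long)input[n + 3] * input[n + 3];
            acc += (long long)input[n + 4] * input[n + 4];
            acc += (long long)input[n + 5] * input[n + 5];
            acc += (long long)input[n + 6] * input[n + 6];
            acc += (long long)input[n + 7] * input[n + 7];
        }

        /* Scalar epilog: the remaining 0..7 samples. */
        for (; n < length; n++) {
            acc += (long long)input[n] * input[n];
        }
        return acc;
    }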
---
.gitignore | 2 +-
Makefile.am | 44 +-
celt/tests/test_unit_optimization_lpc.c | 56 +--
silk/arm/arm_silk_map.c | 20 +
silk/arm/warped_autocorrelation_FIX_arm.h | 65 +++
silk/arm/warped_autocorrelation_FIX_neon_intr.c | 495 +++++++++++++++++++++
silk/fixed/main_FIX.h | 15 +-
.../fixed/mips/warped_autocorrelation_FIX_mipsr1.h | 6 -
silk/fixed/warped_autocorrelation_FIX.c | 7 +-
...test_unit_optimization_warped_autocorrelation.c | 441 ++++++++++++++++++
silk_headers.mk | 1 +
silk_sources.mk | 3 +-
tests/test_unit_optimization.c | 64 +++
13 files changed, 1153 insertions(+), 66 deletions(-)
create mode 100644 silk/arm/warped_autocorrelation_FIX_arm.h
create mode 100644 silk/arm/warped_autocorrelation_FIX_neon_intr.c
create mode 100644 silk/tests/test_unit_optimization_warped_autocorrelation.c
create mode 100644 tests/test_unit_optimization.c
diff --git a/.gitignore b/.gitignore
index 9e824af..05d0582 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,6 +49,7 @@ tests/test_opus_api
tests/test_opus_decode
tests/test_opus_encode
tests/test_opus_padding
+tests/test_unit_optimization
celt/arm/armopts.s
celt/dump_modes/dump_modes
celt/tests/test_unit_cwrs32
@@ -57,7 +58,6 @@ celt/tests/test_unit_entropy
celt/tests/test_unit_laplace
celt/tests/test_unit_mathops
celt/tests/test_unit_mdct
-celt/tests/test_unit_optimization_lpc
celt/tests/test_unit_rotation
celt/tests/test_unit_types
doc/doxygen_sqlite3.db
diff --git a/Makefile.am b/Makefile.am
index 08d26c5..2bfb923 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -84,9 +84,36 @@ pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_type
noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
if EXTRA_PROGRAMS
-noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_types
-
-TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding
+noinst_PROGRAMS = opus_demo \
+ repacketizer_demo \
+ opus_compare \
+ celt/tests/test_unit_cwrs32 \
+ celt/tests/test_unit_dft \
+ celt/tests/test_unit_entropy \
+ celt/tests/test_unit_laplace \
+ celt/tests/test_unit_mathops \
+ celt/tests/test_unit_mdct \
+ celt/tests/test_unit_rotation \
+ celt/tests/test_unit_types \
+ tests/test_opus_api \
+ tests/test_opus_encode \
+ tests/test_opus_decode \
+ tests/test_opus_padding \
+ tests/test_unit_optimization
+
+TESTS = celt/tests/test_unit_types \
+ celt/tests/test_unit_mathops \
+ celt/tests/test_unit_entropy \
+ celt/tests/test_unit_laplace \
+ celt/tests/test_unit_dft \
+ celt/tests/test_unit_mdct \
+ celt/tests/test_unit_rotation \
+ celt/tests/test_unit_cwrs32 \
+ tests/test_opus_api \
+ tests/test_opus_decode \
+ tests/test_opus_encode \
+ tests/test_opus_padding \
+ tests/test_unit_optimization
opus_demo_SOURCES = src/opus_demo.c
@@ -111,6 +138,9 @@ tests_test_opus_decode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
tests_test_opus_padding_SOURCES = tests/test_opus_padding.c tests/test_opus_common.h
tests_test_opus_padding_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
+tests_test_unit_optimization_SOURCES = tests/test_unit_optimization.c
+tests_test_unit_optimization_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
+
celt_tests_test_unit_cwrs32_SOURCES = celt/tests/test_unit_cwrs32.c
celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
@@ -138,12 +168,6 @@ if OPUS_ARM_EXTERNAL_ASM
celt_tests_test_unit_mdct_LDADD += libarmasm.la
endif
-celt_tests_test_unit_optimization_lpc_SOURCES = celt/tests/test_unit_optimization_lpc.c
-celt_tests_test_unit_optimization_lpc_LDADD = $(NE10_LIBS) $(LIBM)
-if OPUS_ARM_EXTERNAL_ASM
-celt_tests_test_unit_optimization_lpc_LDADD += libarmasm.la
-endif
-
celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
celt_tests_test_unit_rotation_LDADD = $(NE10_LIBS) $(LIBM)
if OPUS_ARM_EXTERNAL_ASM
@@ -283,7 +307,7 @@ OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
$(celt_tests_test_unit_rotation_SOURCES:.c=.o) \
$(celt_tests_test_unit_mdct_SOURCES:.c=.o) \
$(celt_tests_test_unit_dft_SOURCES:.c=.o) \
- $(celt_tests_test_unit_optimization_lpc_SOURCES:.c=.o)
+ $(tests_test_unit_optimization_SOURCES:.c=.o)
if HAVE_SSE
SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
diff --git a/celt/tests/test_unit_optimization_lpc.c b/celt/tests/test_unit_optimization_lpc.c
index 903c838..7247046 100644
--- a/celt/tests/test_unit_optimization_lpc.c
+++ b/celt/tests/test_unit_optimization_lpc.c
@@ -25,56 +25,39 @@
*/
#ifdef HAVE_CONFIG_H
-#include "config.h"
+# include "config.h"
#endif
#define SKIP_CONFIG_H
#ifndef CUSTOM_MODES
-#define CUSTOM_MODES
+# define CUSTOM_MODES
#endif
#include <stdio.h>
+#include <string.h>
-#define CELT_C
-#include "stack_alloc.h"
-#include "mathops.c"
-#include "entcode.c"
+#ifndef CELT_C
+# define CELT_C
+#endif
+#include "celt_lpc.h"
+#include "modes.h"
#ifdef FIXED_POINT
-#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
-# include "celt_lpc.c"
-# include "pitch.c"
-# include "x86/x86cpu.c"
-# include "x86/celt_lpc_sse.c"
-# include "x86/pitch_sse2.c"
-# include "x86/pitch_sse4_1.c"
-# include "x86/x86_celt_map.c"
-#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
-# include "celt_lpc.c"
-# include "pitch.c"
-# include "arm/armcpu.c"
-# include "arm/arm_celt_map.c"
-# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
-# include "arm/celt_lpc_neon_intr.c"
-# include "arm/celt_neon_intr.c"
-# endif
-#endif
-
#define MAX_ORDER 32
-void test_fir(int arch)
+static int test_fir(int arch)
{
opus_val16 x[MAX_PERIOD+MAX_ORDER];
opus_val16 num[MAX_ORDER];
opus_val16 yorg[MAX_PERIOD], yopt[MAX_PERIOD];
int N, ord;
-
unsigned int i;
+
+ printf("%50s", "celt_fir() ...");
for(ord=0;ord<=MAX_ORDER;ord++)
{
- printf("ord=%2d", ord);
for(N=ord;N<=MAX_PERIOD;N++) /* N is larger than or equal to ord. */
{
for (i=0;i<MAX_PERIOD+MAX_ORDER;++i)
@@ -95,7 +78,7 @@ void test_fir(int arch)
celt_fir (x+MAX_ORDER, num, yopt, N, ord, arch);
if (memcmp(yorg, yopt, sizeof(yorg)))
{
- printf(" N=%3d failed!\nError in lpc unit test!!!\n", N);
+ printf("ord=%2d N=%3d failed!\nError in lpc unit test!!!\n", ord, N);
for (i=0;i<sizeof(yorg) / sizeof(*yorg);i++)
{
if (yorg[i] != yopt[i])
@@ -103,20 +86,11 @@ void test_fir(int arch)
printf("yorg[%3d]=%d, yopt[%3d]=%d\n", i, yorg[i], i, yopt[i]);
}
}
- exit(0);
+ return -1;
}
}
- printf(" passed!\n");
}
-}
-#endif /* FIXED_POINT */
-
-int main(void)
-{
-#ifdef FIXED_POINT
- ALLOC_STACK;
- int arch = opus_select_arch();
- test_fir(arch);
-#endif /* FIXED_POINT */
+ printf(" passed!\n");
return 0;
}
+#endif /* FIXED_POINT */
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 9bd86a7..49fd672 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -28,6 +28,7 @@ POSSIBILITY OF SUCH DAMAGE.
# include "config.h"
#endif
+#include "main_FIX.h"
#include "NSQ.h"
#if defined(OPUS_HAVE_RTCD)
@@ -52,4 +53,23 @@ opus_int32
# endif
+#if defined(FIXED_POINT) && \
+ defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+
+void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
+ opus_int32 *corr, /* O Result [order + 1] */
+ opus_int *scale, /* O Scaling of the correlation vector */
+ const opus_int16 *input, /* I Input data to correlate */
+ const opus_int warping_Q16, /* I Warping coefficient */
+ const opus_int length, /* I Length of input */
+ const opus_int order /* I Correlation order (even) */
+) = {
+ silk_warped_autocorrelation_FIX_c, /* ARMv4 */
+ silk_warped_autocorrelation_FIX_c, /* EDSP */
+ silk_warped_autocorrelation_FIX_c, /* Media */
+ silk_warped_autocorrelation_FIX_neon, /* Neon */
+};
+
+#endif
+
#endif /* OPUS_HAVE_RTCD */
diff --git a/silk/arm/warped_autocorrelation_FIX_arm.h b/silk/arm/warped_autocorrelation_FIX_arm.h
new file mode 100644
index 0000000..ee892bf
--- /dev/null
+++ b/silk/arm/warped_autocorrelation_FIX_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(WARPED_AUTOCORRELATION_FIX_ARM_H)
+# define WARPED_AUTOCORRELATION_FIX_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if defined(FIXED_POINT)
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
+void silk_warped_autocorrelation_FIX_neon(
+ opus_int32 *corr, /* O Result [order + 1] */
+ opus_int *scale, /* O Scaling of the correlation vector */
+ const opus_int16 *input, /* I Input data to correlate */
+ const opus_int warping_Q16, /* I Warping coefficient */
+ const opus_int length, /* I Length of input */
+ const opus_int order /* I Correlation order (even) */
+);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD)
+# define OVERRIDE_silk_warped_autocorrelation_FIX (1)
+# define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+ ((void)(arch),PRESUME_NEON(silk_warped_autocorrelation_FIX)(corr, scale, input, warping_Q16, length, order))
+# endif
+
+#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+ || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+ && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
+extern void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK+1])(opus_int32*, opus_int*, const opus_int16*, const opus_int, const opus_int, const opus_int);
+
+# define OVERRIDE_silk_warped_autocorrelation_FIX
+# define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+ ((*SILK_WARPED_AUTOCORRELATION_FIX_IMPL[(arch)&OPUS_ARCHMASK])(corr, scale, input, warping_Q16, length, order))
+# endif
+#endif
+#endif /* end FIXED_POINT */
+
+#endif /* end WARPED_AUTOCORRELATION_FIX_ARM_H */
diff --git a/silk/arm/warped_autocorrelation_FIX_neon_intr.c b/silk/arm/warped_autocorrelation_FIX_neon_intr.c
new file mode 100644
index 0000000..80dd949
--- /dev/null
+++ b/silk/arm/warped_autocorrelation_FIX_neon_intr.c
@@ -0,0 +1,495 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+ @file warped_autocorrelation_FIX_neon_intr.c
+ @brief ARM Neon Intrinsic optimizations for silk silk_warped_autocorrelation_FIX functions
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+
+#include <arm_neon.h>
+#include "stack_alloc.h"
+#include "main_FIX.h"
+
+#ifdef FIXED_POINT
+
+#define NUM_PARALLEL_INPUTS 8
+
+void silk_warped_autocorrelation_FIX_neon(
+ opus_int32 *corr, /* O Result [order + 1] */
+ opus_int *scale, /* O Scaling of the correlation vector */
+ const opus_int16 *input, /* I Input data to correlate */
+ const opus_int warping_Q16, /* I Warping coefficient */
+ const opus_int length, /* I Length of input */
+ const opus_int order /* I Correlation order (even) */
+)
+{
+ opus_int n = 0, i, lsh;
+ opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
+ opus_int32 input_QS[NUM_PARALLEL_INPUTS];
+ opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 3 ] = { 0 }; // Create two extra entries.
+ opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop, and one extra end entry in the last prolog.
+ opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+
+ /* Order must be even */
+ silk_assert( ( order & 1 ) == 0 );
+ silk_assert( 2 * QS - QC >= 0 );
+
+ /* Loop over samples */
+ if( order >= NUM_PARALLEL_INPUTS - 2 ) {
+ const int32x2_t warping_Q16_s32 = vdup_n_s32(warping_Q16);
+ for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) {
+ int32x4_t tmp1_QS0_s32x4, tmp1_QS1_s32x4, tmp2_QS0_s32x4, tmp2_QS1_s32x4;
+ int64x2_t corr_QC0_s64x2, corr_QC1_s64x2, corr_QC2_s64x2, corr_QC3_s64x2;
+ int64x2_t t0_s64x2, t1_s64x2, t2_s64x2, t3_s64x2;
+ int32x2_t tmp1_QS_s32x2, tmp2_QS_s32x2;
+ int64x1_t corr_QC_s64x1;
+ const int32x4_t input_QS0_s32x4 = vshll_n_s16(vld1_s16(input + n), QS);
+ const int32x4_t input_QS1_s32x4 = vshll_n_s16(vld1_s16(input + n + 4), QS);
+ vst1q_s32(tmp1_QS, input_QS0_s32x4);
+ vst1q_s32(tmp1_QS + 4, input_QS1_s32x4);
+
+ /* Loop over allpass sections */
+
+ /* -------------------- prolog 0 -------------------- */
+
+ tmp1_QS_s32x2 = vget_low_s32(input_QS0_s32x4);
+ tmp2_QS_s32x2 = vld1_s32(state_QS + order); // Accessed one extra end entry.
+ vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
+ corr_QC_s64x1 = vld1_s64(corr_QC + order);
+ t0_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ corr_QC_s64x1 = vadd_s64(corr_QC_s64x1, vget_low_s64(t0_s64x2));
+ vst1_s64(corr_QC + order, corr_QC_s64x1);
+ tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + order - 1), tmp1_QS_s32x2);
+ t0_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+ tmp1_QS_s32x2 = vshrn_n_s64(t0_s64x2, 16);
+ tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+ tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 1, tmp1_QS_s32x2, 1);
+
+ /* -------------------- prolog 1 -------------------- */
+
+ tmp2_QS_s32x2 = vld1_s32(state_QS + order - 1);
+ vst1_s32(state_QS + order - 1, tmp1_QS_s32x2);
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 1);
+ t0_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32(input_QS0_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+ vst1q_s64(corr_QC + order - 1, corr_QC0_s64x2);
+ tmp1_QS_s32x2 = vsub_s32(vld1_s32(state_QS + order - 2), tmp1_QS_s32x2);
+ t0_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+ tmp1_QS_s32x2 = vshrn_n_s64(t0_s64x2, 16);
+ tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+ tmp1_QS0_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
+
+ /* -------------------- prolog 2 -------------------- */
+
+ tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry.
+ vst1q_s32(state_QS + order - 2, tmp1_QS0_s32x4); // Saving one extra entry is OK.
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 2);
+ corr_QC_s64x1 = vld1_s64 (corr_QC + order);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+ corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t1_s64x2));
+ vst1q_s64(corr_QC + order - 2, corr_QC0_s64x2);
+ vst1_s64 (corr_QC + order, corr_QC_s64x1);
+ tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS0_s32x4);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+ tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+ tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+ tmp1_QS0_s32x4 = vld1q_lane_s32(tmp1_QS + 3, tmp1_QS0_s32x4, 3);
+
+ /* -------------------- prolog 3 -------------------- */
+
+ tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 3);
+ vst1q_s32(state_QS + order - 3, tmp1_QS0_s32x4);
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 3);
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 1);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+ corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+ vst1q_s64(corr_QC + order - 3, corr_QC0_s64x2);
+ vst1q_s64(corr_QC + order - 1, corr_QC1_s64x2);
+ tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 4), tmp1_QS0_s32x4);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+ tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+ tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+ tmp1_QS_s32x2 = vget_low_s32(input_QS1_s32x4);
+
+ /* -------------------- prolog 4 -------------------- */
+
+ tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 4);
+ tmp2_QS_s32x2 = vld1_lane_s32(state_QS + order, tmp2_QS_s32x2, 0);
+ vst1q_s32(state_QS + order - 4, tmp1_QS0_s32x4);
+ vst1_lane_s32(state_QS + order, tmp1_QS_s32x2, 0);
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 4);
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 2);
+ corr_QC_s64x1 = vld1_s64 (corr_QC + order);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+ t2_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32 (input_QS1_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+ corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+ corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t2_s64x2));
+ vst1q_s64(corr_QC + order - 4, corr_QC0_s64x2);
+ vst1q_s64(corr_QC + order - 2, corr_QC1_s64x2);
+ vst1_s64 (corr_QC + order, corr_QC_s64x1);
+ tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 5), tmp1_QS0_s32x4);
+ tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS + order - 1), tmp1_QS_s32x2);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+ t2_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+ tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+ tmp1_QS_s32x2 = vshrn_n_s64(t2_s64x2, 16);
+ tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+ tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2);
+ tmp1_QS_s32x2 = vld1_lane_s32(tmp1_QS + 5, tmp1_QS_s32x2, 1);
+
+ /* -------------------- prolog 5 -------------------- */
+
+ tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 5);
+ tmp2_QS_s32x2 = vld1_s32 (state_QS + order - 1);
+ vst1q_s32(state_QS + order - 5, tmp1_QS0_s32x4);
+ vst1_s32 (state_QS + order - 1, tmp1_QS_s32x2);
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 5);
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 3);
+ corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 1);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+ t2_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_low_s32 (input_QS1_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+ corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+ corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+ vst1q_s64(corr_QC + order - 5, corr_QC0_s64x2);
+ vst1q_s64(corr_QC + order - 3, corr_QC1_s64x2);
+ vst1q_s64(corr_QC + order - 1, corr_QC2_s64x2);
+ tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 6), tmp1_QS0_s32x4);
+ tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS + order - 2), tmp1_QS_s32x2);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+ t2_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+ tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+ tmp1_QS_s32x2 = vshrn_n_s64(t2_s64x2, 16);
+ tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+ tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2);
+ tmp1_QS1_s32x4 = vcombine_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
+
+ /* -------------------- prolog 6 -------------------- */
+
+ tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - 6);
+ tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - 2); // Accessed one extra end entry.
+ vst1q_s32(state_QS + order - 6, tmp1_QS0_s32x4);
+ vst1q_s32(state_QS + order - 2, tmp1_QS1_s32x4); // Saving one extra entry is OK.
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - 6);
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + order - 4);
+ corr_QC2_s64x2 = vld1q_s64(corr_QC + order - 2);
+ corr_QC_s64x1 = vld1_s64 (corr_QC + order);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+ corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+ corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+ corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_low_s64(t3_s64x2));
+ vst1q_s64(corr_QC + order - 6, corr_QC0_s64x2);
+ vst1q_s64(corr_QC + order - 4, corr_QC1_s64x2);
+ vst1q_s64(corr_QC + order - 2, corr_QC2_s64x2);
+ vst1_s64 (corr_QC + order, corr_QC_s64x1);
+ tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 7), tmp1_QS0_s32x4); // Accessed one extra head entry when order is 6.
+ tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - 3), tmp1_QS1_s32x4);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+ tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+ tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+ tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+ tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+ tmp1_QS1_s32x4 = vld1q_lane_s32(tmp1_QS + 7, tmp1_QS1_s32x4, 3);
+
+ /* -------------------- kernel loop -------------------- */
+
+ for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
+ /* Output of allpass section */
+ tmp2_QS0_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1);
+ tmp2_QS1_s32x4 = vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5);
+ vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 1, tmp1_QS0_s32x4);
+ vst1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 5, tmp1_QS1_s32x4);
+ corr_QC0_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1);
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3);
+ corr_QC2_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5);
+ corr_QC3_s64x2 = vld1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC0_s64x2 = vaddq_s64(corr_QC0_s64x2, t0_s64x2);
+ corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+ corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+ corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+ vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 1, corr_QC0_s64x2);
+ vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 3, corr_QC1_s64x2);
+ vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 5, corr_QC2_s64x2);
+ vst1q_s64(corr_QC + order - i - NUM_PARALLEL_INPUTS + 7, corr_QC3_s64x2);
+ tmp1_QS0_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS), tmp1_QS0_s32x4); // Accessed one extra head entry in the last loop.
+ tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + order - i - NUM_PARALLEL_INPUTS + 4), tmp1_QS1_s32x4);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), warping_Q16_s32);
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), warping_Q16_s32);
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+ tmp1_QS0_s32x4 = vcombine_s32(vshrn_n_s64(t0_s64x2, 16), vshrn_n_s64(t1_s64x2, 16));
+ tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+ tmp1_QS0_s32x4 = vaddq_s32(tmp1_QS0_s32x4, tmp2_QS0_s32x4);
+ tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+ }
+
+ /* -------------------- epilog 0 -------------------- */
+
+ tmp2_QS_s32x2 = vld1_s32(state_QS + 1);
+ tmp2_QS1_s32x4 = vld1q_s32(state_QS + 3);
+ vst1q_s32(state_QS - 1, tmp1_QS0_s32x4); // Saving one extra entry is OK.
+ vst1q_s32(state_QS + 3, tmp1_QS1_s32x4);
+ corr_QC_s64x1 = vld1_s64 (corr_QC);
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + 1);
+ corr_QC2_s64x2 = vld1q_s64(corr_QC + 3);
+ corr_QC3_s64x2 = vld1q_s64(corr_QC + 5);
+ t0_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS0_s32x4), vget_low_s32 (input_QS0_s32x4));
+ t1_s64x2 = vmull_s32(vget_high_s32(tmp1_QS0_s32x4), vget_high_s32(input_QS0_s32x4));
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+ t0_s64x2 = vshrq_n_s64(t0_s64x2, 2 * QS - QC);
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t0_s64x2));
+ corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+ corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+ corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+ vst1_s64 (corr_QC + 0, corr_QC_s64x1);
+ vst1q_s64(corr_QC + 1, corr_QC1_s64x2);
+ vst1q_s64(corr_QC + 3, corr_QC2_s64x2);
+ vst1q_s64(corr_QC + 5, corr_QC3_s64x2);
+ tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS), vget_high_s32(tmp1_QS0_s32x4));
+ tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 2), tmp1_QS1_s32x4);
+ t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+ tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16);
+ tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+ tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2);
+ tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+ /* -------------------- epilog 1 -------------------- */
+
+ tmp2_QS_s32x2 = vld1_s32 (state_QS);
+ tmp2_QS1_s32x4 = vld1q_s32(state_QS + 2);
+ vst1_s32 (state_QS, tmp1_QS_s32x2);
+ vst1q_s32(state_QS + 2, tmp1_QS1_s32x4);
+ corr_QC1_s64x2 = vld1q_s64(corr_QC + 0);
+ corr_QC2_s64x2 = vld1q_s64(corr_QC + 2);
+ corr_QC3_s64x2 = vld1q_s64(corr_QC + 4);
+ t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC1_s64x2 = vaddq_s64(corr_QC1_s64x2, t1_s64x2);
+ corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+ corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+ vst1q_s64(corr_QC + 0, corr_QC1_s64x2);
+ vst1q_s64(corr_QC + 2, corr_QC2_s64x2);
+ vst1q_s64(corr_QC + 4, corr_QC3_s64x2);
+ tmp1_QS_s32x2 = vsub_s32 (vld1_s32 (state_QS - 1), tmp1_QS_s32x2); // Accessed one extra head entry.
+ tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS + 1), tmp1_QS1_s32x4);
+ t1_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+ tmp1_QS_s32x2 = vshrn_n_s64(t1_s64x2, 16);
+ tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+ tmp1_QS_s32x2 = vadd_s32 (tmp1_QS_s32x2, tmp2_QS_s32x2);
+ tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+ /* -------------------- epilog 2 -------------------- */
+
+ tmp2_QS1_s32x4 = vld1q_s32(state_QS + 1);
+ vst1_lane_s32(state_QS, tmp1_QS_s32x2, 1);
+ vst1q_s32 (state_QS + 1, tmp1_QS1_s32x4);
+ corr_QC_s64x1 = vld1_s64(corr_QC);
+ corr_QC2_s64x2 = vld1q_s64(corr_QC + 1);
+ corr_QC3_s64x2 = vld1q_s64(corr_QC + 3);
+ t1_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS0_s32x4));
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+ t1_s64x2 = vshrq_n_s64(t1_s64x2, 2 * QS - QC);
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t1_s64x2));
+ corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+ corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+ vst1_s64 (corr_QC + 0, corr_QC_s64x1);
+ vst1q_s64(corr_QC + 1, corr_QC2_s64x2);
+ vst1q_s64(corr_QC + 3, corr_QC3_s64x2);
+ tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS), tmp1_QS1_s32x4);
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+ tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+ tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+ /* -------------------- epilog 3 -------------------- */
+
+ tmp2_QS1_s32x4 = vld1q_s32(state_QS);
+ vst1q_s32(state_QS, tmp1_QS1_s32x4);
+ corr_QC2_s64x2 = vld1q_s64(corr_QC);
+ corr_QC3_s64x2 = vld1q_s64(corr_QC + 2);
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC2_s64x2 = vaddq_s64(corr_QC2_s64x2, t2_s64x2);
+ corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+ vst1q_s64(corr_QC, corr_QC2_s64x2);
+ vst1q_s64(corr_QC + 2, corr_QC3_s64x2);
+ tmp1_QS1_s32x4 = vsubq_s32(vld1q_s32(state_QS - 1), tmp1_QS1_s32x4); // Accessed one extra head entry.
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), warping_Q16_s32);
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), warping_Q16_s32);
+ tmp1_QS1_s32x4 = vcombine_s32(vshrn_n_s64(t2_s64x2, 16), vshrn_n_s64(t3_s64x2, 16));
+ tmp1_QS1_s32x4 = vaddq_s32(tmp1_QS1_s32x4, tmp2_QS1_s32x4);
+
+ /* -------------------- epilog 4 -------------------- */
+
+ corr_QC_s64x1 = vld1_s64 (corr_QC);
+ corr_QC3_s64x2 = vld1q_s64(corr_QC + 1);
+ t2_s64x2 = vmull_s32(vget_low_s32 (tmp1_QS1_s32x4), vget_low_s32 (input_QS1_s32x4));
+ t3_s64x2 = vmull_s32(vget_high_s32(tmp1_QS1_s32x4), vget_high_s32(input_QS1_s32x4));
+ t2_s64x2 = vshrq_n_s64(t2_s64x2, 2 * QS - QC);
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC_s64x1 = vadd_s64 (corr_QC_s64x1, vget_high_s64(t2_s64x2));
+ corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+ vst1_s64 (corr_QC, corr_QC_s64x1);
+ vst1q_s64(corr_QC + 1, corr_QC3_s64x2);
+ vst1q_s32(tmp1_QS + 4, tmp1_QS1_s32x4);
+
+ tmp2_QS_s32x2 = vld1_s32(state_QS + 1);
+ tmp1_QS_s32x2 = vsub_s32(vld1_s32(tmp1_QS + 5), vget_high_s32(tmp1_QS1_s32x4));
+ t3_s64x2 = vmull_s32(tmp1_QS_s32x2, warping_Q16_s32);
+ tmp1_QS_s32x2 = vshrn_n_s64(t3_s64x2, 16);
+ tmp1_QS_s32x2 = vadd_s32(tmp1_QS_s32x2, tmp2_QS_s32x2);
+ vst1_lane_s32(state_QS + 1, tmp1_QS_s32x2, 1);
+
+ /* -------------------- epilog 5 & 6 -------------------- */
+
+ vst1_lane_s32(state_QS + 2, vget_high_s32(tmp1_QS1_s32x4), 1);
+ tmp2_QS_s32x2 = vsub_s32(tmp1_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(tmp1_QS_s32x2), 32)));
+ t3_s64x2 = vmull_s32(tmp2_QS_s32x2, warping_Q16_s32);
+ tmp2_QS_s32x2 = vshrn_n_s64(t3_s64x2, 16);
+ tmp2_QS_s32x2 = vadd_s32(vget_high_s32(tmp1_QS1_s32x4), tmp2_QS_s32x2);
+ vst1_lane_s32(state_QS, tmp2_QS_s32x2, 0);
+
+ corr_QC3_s64x2 = vld1q_s64(corr_QC);
+ t3_s64x2 = vmull_s32(tmp1_QS_s32x2, vget_high_s32(input_QS1_s32x4));
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC3_s64x2 = vaddq_s64(corr_QC3_s64x2, t3_s64x2);
+ vst1_s64(corr_QC + 1, vget_high_s64(corr_QC3_s64x2));
+ t3_s64x2 = vmull_s32(tmp2_QS_s32x2, vreinterpret_s32_s64(vshr_n_s64(vreinterpret_s64_s32(vget_high_s32(input_QS1_s32x4)), 32)));
+ t3_s64x2 = vshrq_n_s64(t3_s64x2, 2 * QS - QC);
+ corr_QC_s64x1 = vadd_s64(vget_low_s64(corr_QC3_s64x2), vget_low_s64(t3_s64x2));
+ vst1_s64(corr_QC, corr_QC_s64x1);
+ }
+ }
+
+ for( ; n < length; n++ ) {
+ input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+ /* Loop over allpass sections */
+ for( i = 0; i <= order; i++ ) {
+ /* Output of allpass section */
+ tmp2_QS[ 0 ] = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+ state_QS[ order - i ] = tmp1_QS[ 0 ];
+ corr_QC[ order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ tmp1_QS[ 0 ] = tmp2_QS[ 0 ];
+ }
+ }
+ lsh = silk_CLZ64( corr_QC[ order ] ) - 35;
+ lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+ *scale = -( QC + lsh );
+ silk_assert( *scale >= -30 && *scale <= 12 );
+ const int64x2_t lsh_s64x2 = vdupq_n_s64(lsh);
+ for( i = 0; i <= order - 3; i += 4 ) {
+ int64x2_t corr_QC0_s64x2 = vld1q_s64(corr_QC + i);
+ int64x2_t corr_QC1_s64x2 = vld1q_s64(corr_QC + i + 2);
+ corr_QC0_s64x2 = vshlq_s64(corr_QC0_s64x2, lsh_s64x2);
+ corr_QC1_s64x2 = vshlq_s64(corr_QC1_s64x2, lsh_s64x2);
+ int32x4_t corr_s32x4 = vcombine_s32(vmovn_s64(corr_QC1_s64x2), vmovn_s64(corr_QC0_s64x2));
+ corr_s32x4 = vrev64q_s32(corr_s32x4);
+ vst1q_s32(corr + order - i - 3, corr_s32x4);
+ }
+ if( lsh >= 0 ) {
+ for( ; i <= order; i++ ) {
+ corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) );
+ }
+ } else {
+ for( ; i <= order; i++ ) {
+ corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) );
+ }
+ }
+ silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC*/
+}
+
+#endif /* FIXED_POINT */
diff --git a/silk/fixed/main_FIX.h b/silk/fixed/main_FIX.h
index 375b5eb..02f6802 100644
--- a/silk/fixed/main_FIX.h
+++ b/silk/fixed/main_FIX.h
@@ -36,6 +36,11 @@ POSSIBILITY OF SUCH DAMAGE.
#include "debug.h"
#include "entenc.h"
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/warped_autocorrelation_FIX_arm.h"
+#endif
+
#ifndef FORCE_CPP_BUILD
#ifdef __cplusplus
extern "C"
@@ -47,6 +52,9 @@ extern "C"
#define silk_encode_do_VAD_Fxx silk_encode_do_VAD_FIX
#define silk_encode_frame_Fxx silk_encode_frame_FIX
+#define QC 10
+#define QS 14
+
/*********************/
/* Encoder Functions */
/*********************/
@@ -121,7 +129,7 @@ void silk_noise_shape_analysis_FIX(
);
/* Autocorrelations for a warped frequency axis */
-void silk_warped_autocorrelation_FIX(
+void silk_warped_autocorrelation_FIX_c(
opus_int32 *corr, /* O Result [order + 1] */
opus_int *scale, /* O Scaling of the correlation vector */
const opus_int16 *input, /* I Input data to correlate */
@@ -130,6 +138,11 @@ void silk_warped_autocorrelation_FIX(
const opus_int order /* I Correlation order (even) */
);
+#if !defined(OVERRIDE_silk_warped_autocorrelation_FIX)
+#define silk_warped_autocorrelation_FIX(corr, scale, input, warping_Q16, length, order) \
+ (silk_warped_autocorrelation_FIX_c(corr, scale, input, warping_Q16, length, order))
+#endif
+
/* Calculation of LTP state scaling */
void silk_LTP_scale_ctrl_FIX(
silk_encoder_state_FIX *psEnc, /* I/O encoder state */
diff --git a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
index e803ef0..6916940 100644
--- a/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
+++ b/silk/fixed/mips/warped_autocorrelation_FIX_mipsr1.h
@@ -34,12 +34,6 @@ POSSIBILITY OF SUCH DAMAGE.
#include "main_FIX.h"
-#undef QC
-#define QC 10
-
-#undef QS
-#define QS 14
-
/* Autocorrelations for a warped frequency axis */
#define OVERRIDE_silk_warped_autocorrelation_FIX
void silk_warped_autocorrelation_FIX(
diff --git a/silk/fixed/warped_autocorrelation_FIX.c b/silk/fixed/warped_autocorrelation_FIX.c
index 6ca6c11..994c299 100644
--- a/silk/fixed/warped_autocorrelation_FIX.c
+++ b/silk/fixed/warped_autocorrelation_FIX.c
@@ -31,17 +31,13 @@ POSSIBILITY OF SUCH DAMAGE.
#include "main_FIX.h"
-#define QC 10
-#define QS 14
-
#if defined(MIPSr1_ASM)
#include "mips/warped_autocorrelation_FIX_mipsr1.h"
#endif
-#ifndef OVERRIDE_silk_warped_autocorrelation_FIX
/* Autocorrelations for a warped frequency axis */
-void silk_warped_autocorrelation_FIX(
+void silk_warped_autocorrelation_FIX_c(
opus_int32 *corr, /* O Result [order + 1] */
opus_int *scale, /* O Scaling of the correlation vector */
const opus_int16 *input, /* I Input data to correlate */
@@ -92,4 +88,3 @@ void silk_warped_autocorrelation_FIX(
}
silk_assert( corr_QC[ 0 ] >= 0 ); /* If breaking, decrease QC*/
}
-#endif /* OVERRIDE_silk_warped_autocorrelation_FIX */
diff --git a/silk/tests/test_unit_optimization_warped_autocorrelation.c b/silk/tests/test_unit_optimization_warped_autocorrelation.c
new file mode 100644
index 0000000..b7d0ad0
--- /dev/null
+++ b/silk/tests/test_unit_optimization_warped_autocorrelation.c
@@ -0,0 +1,441 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include "main_FIX.h"
+
+#ifdef FIXED_POINT
+
+/* Unrolling the input loop by 8 is about 25% faster than unrolling by 4 on Chromebook with an ARMv7 Processor. */
+#define NUM_PARALLEL_INPUTS 8
+
+/* Keep this function here because it is the code base to optimize on different CPUs. */
+void silk_warped_autocorrelation_FIX_c_opt(
+ opus_int32 *corr, /* O Result [order + 1] */
+ opus_int *scale, /* O Scaling of the correlation vector */
+ const opus_int16 *input, /* I Input data to correlate */
+ const opus_int warping_Q16, /* I Warping coefficient */
+ const opus_int length, /* I Length of input */
+ const opus_int order /* I Correlation order (even) */
+)
+{
+ opus_int n = 0, i, j, lsh;
+ opus_int32 tmp1_QS[NUM_PARALLEL_INPUTS], tmp2_QS[NUM_PARALLEL_INPUTS];
+ opus_int32 input_QS[NUM_PARALLEL_INPUTS];
+ opus_int32 state_QS_tmp[ MAX_SHAPE_LPC_ORDER + 2 ] = { 0 }; // Create one extra entry.
+ opus_int32 *state_QS = state_QS_tmp + 1; // Accessed one extra head entry in the last prolog and the last inner loop.
+ opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
+
+ /* Order must be even */
+ silk_assert( ( order & 1 ) == 0 );
+ silk_assert( 2 * QS - QC >= 0 );
+
+ /* Loop over samples */
+ if( order >= NUM_PARALLEL_INPUTS - 2 ) {
+ for( ; n < (length - NUM_PARALLEL_INPUTS + 1); n += NUM_PARALLEL_INPUTS ) {
+ for( i = 0; i < NUM_PARALLEL_INPUTS; i++ ) {
+ input_QS[i] = tmp1_QS[i] = silk_LSHIFT32( (opus_int32)input[ n + i ], QS );
+ }
+
+ /* Loop over allpass sections */
+
+ /* -------------------- prolog 0 -------------------- */
+
+ tmp2_QS[ 0 ] = state_QS[ order ];
+ state_QS[ order ] = tmp1_QS[ 0 ];
+ corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+ /* -------------------- prolog 1 -------------------- */
+
+ tmp2_QS[ 0 ] = state_QS[ order - 1 ];
+ tmp2_QS[ 1 ] = state_QS[ order ];
+
+ state_QS[ order - 1 ] = tmp1_QS[ 0 ];
+ state_QS[ order ] = tmp1_QS[ 1 ];
+
+ corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 1 ] - tmp1_QS[ 1 ], warping_Q16 );
+ tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 2 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+ /* -------------------- prolog 2 -------------------- */
+
+ tmp2_QS[ 0 ] = state_QS[ order - 2 ];
+ tmp2_QS[ 1 ] = state_QS[ order - 1 ];
+ tmp2_QS[ 2 ] = state_QS[ order ];
+
+ state_QS[ order - 2 ] = tmp1_QS[ 0 ];
+ state_QS[ order - 1 ] = tmp1_QS[ 1 ];
+ state_QS[ order ] = tmp1_QS[ 2 ];
+
+ corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+ corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 1 ] - tmp1_QS[ 2 ], warping_Q16 );
+ tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 2 ] - tmp1_QS[ 1 ], warping_Q16 );
+ tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 3 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+ /* -------------------- prolog 3 -------------------- */
+
+ tmp2_QS[ 0 ] = state_QS[ order - 3 ];
+ tmp2_QS[ 1 ] = state_QS[ order - 2 ];
+ tmp2_QS[ 2 ] = state_QS[ order - 1 ];
+ tmp2_QS[ 3 ] = state_QS[ order ];
+
+ state_QS[ order - 3 ] = tmp1_QS[ 0 ];
+ state_QS[ order - 2 ] = tmp1_QS[ 1 ];
+ state_QS[ order - 1 ] = tmp1_QS[ 2 ];
+ state_QS[ order ] = tmp1_QS[ 3 ];
+
+ corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+ corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+ corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 1 ] - tmp1_QS[ 3 ], warping_Q16 );
+ tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 2 ] - tmp1_QS[ 2 ], warping_Q16 );
+ tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 3 ] - tmp1_QS[ 1 ], warping_Q16 );
+ tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 4 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+ /* -------------------- prolog 4 -------------------- */
+
+ tmp2_QS[ 0 ] = state_QS[ order - 4 ];
+ tmp2_QS[ 1 ] = state_QS[ order - 3 ];
+ tmp2_QS[ 2 ] = state_QS[ order - 2 ];
+ tmp2_QS[ 3 ] = state_QS[ order - 1 ];
+ tmp2_QS[ 4 ] = state_QS[ order ];
+
+ state_QS[ order - 4 ] = tmp1_QS[ 0 ];
+ state_QS[ order - 3 ] = tmp1_QS[ 1 ];
+ state_QS[ order - 2 ] = tmp1_QS[ 2 ];
+ state_QS[ order - 1 ] = tmp1_QS[ 3 ];
+ state_QS[ order ] = tmp1_QS[ 4 ];
+
+ corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+ corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+ corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+ corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 1 ] - tmp1_QS[ 4 ], warping_Q16 );
+ tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 2 ] - tmp1_QS[ 3 ], warping_Q16 );
+ tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 3 ] - tmp1_QS[ 2 ], warping_Q16 );
+ tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 4 ] - tmp1_QS[ 1 ], warping_Q16 );
+ tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 5 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+ /* -------------------- prolog 5 -------------------- */
+
+ tmp2_QS[ 0 ] = state_QS[ order - 5 ];
+ tmp2_QS[ 1 ] = state_QS[ order - 4 ];
+ tmp2_QS[ 2 ] = state_QS[ order - 3 ];
+ tmp2_QS[ 3 ] = state_QS[ order - 2 ];
+ tmp2_QS[ 4 ] = state_QS[ order - 1 ];
+ tmp2_QS[ 5 ] = state_QS[ order ];
+
+ state_QS[ order - 5 ] = tmp1_QS[ 0 ];
+ state_QS[ order - 4 ] = tmp1_QS[ 1 ];
+ state_QS[ order - 3 ] = tmp1_QS[ 2 ];
+ state_QS[ order - 2 ] = tmp1_QS[ 3 ];
+ state_QS[ order - 1 ] = tmp1_QS[ 4 ];
+ state_QS[ order ] = tmp1_QS[ 5 ];
+
+ corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+ corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+ corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+ corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+ corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 1 ] - tmp1_QS[ 5 ], warping_Q16 );
+ tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 2 ] - tmp1_QS[ 4 ], warping_Q16 );
+ tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 3 ] - tmp1_QS[ 3 ], warping_Q16 );
+ tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 4 ] - tmp1_QS[ 2 ], warping_Q16 );
+ tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 5 ] - tmp1_QS[ 1 ], warping_Q16 );
+ tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 6 ] - tmp1_QS[ 0 ], warping_Q16 );
+
+ /* -------------------- prolog 6 -------------------- */
+
+ tmp2_QS[ 0 ] = state_QS[ order - 6 ];
+ tmp2_QS[ 1 ] = state_QS[ order - 5 ];
+ tmp2_QS[ 2 ] = state_QS[ order - 4 ];
+ tmp2_QS[ 3 ] = state_QS[ order - 3 ];
+ tmp2_QS[ 4 ] = state_QS[ order - 2 ];
+ tmp2_QS[ 5 ] = state_QS[ order - 1 ];
+ tmp2_QS[ 6 ] = state_QS[ order ];
+
+ state_QS[ order - 6 ] = tmp1_QS[ 0 ];
+ state_QS[ order - 5 ] = tmp1_QS[ 1 ];
+ state_QS[ order - 4 ] = tmp1_QS[ 2 ];
+ state_QS[ order - 3 ] = tmp1_QS[ 3 ];
+ state_QS[ order - 2 ] = tmp1_QS[ 4 ];
+ state_QS[ order - 1 ] = tmp1_QS[ 5 ];
+ state_QS[ order ] = tmp1_QS[ 6 ];
+
+ corr_QC[ order - 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ corr_QC[ order - 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+ corr_QC[ order - 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+ corr_QC[ order - 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+ corr_QC[ order - 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+ corr_QC[ order - 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+ corr_QC[ order ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ order - 1 ] - tmp1_QS[ 6 ], warping_Q16 );
+ tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ order - 2 ] - tmp1_QS[ 5 ], warping_Q16 );
+ tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ order - 3 ] - tmp1_QS[ 4 ], warping_Q16 );
+ tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ order - 4 ] - tmp1_QS[ 3 ], warping_Q16 );
+ tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ order - 5 ] - tmp1_QS[ 2 ], warping_Q16 );
+ tmp1_QS[ 1 ] = silk_SMLAWB( tmp2_QS[ 1 ], state_QS[ order - 6 ] - tmp1_QS[ 1 ], warping_Q16 );
+ tmp1_QS[ 0 ] = silk_SMLAWB( tmp2_QS[ 0 ], state_QS[ order - 7 ] - tmp1_QS[ 0 ], warping_Q16 ); // Accessed one extra head entry when order is 6.
+
+ /* -------------------- kernel loop -------------------- */
+
+ for( i = 0; i < order - NUM_PARALLEL_INPUTS + 2; i++ ) {
+ /* Output of allpass section */
+ for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+ tmp2_QS[ j ] = state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ];
+ }
+
+ for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+ state_QS[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] = tmp1_QS[ j ];
+ }
+
+ for( j = 0; j < NUM_PARALLEL_INPUTS; j++ ) {
+ corr_QC[ order - i - NUM_PARALLEL_INPUTS + 1 + j ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ j ], input_QS[ j ] ), 2 * QS - QC );
+ }
+
+ for( j = NUM_PARALLEL_INPUTS - 1; j >= 0; j-- ) {
+ tmp1_QS[ j ] = silk_SMLAWB( tmp2_QS[ j ], state_QS[ order - i - NUM_PARALLEL_INPUTS + j ] - tmp1_QS[ j ], warping_Q16 ); // Accessed one extra head entry in the last loop.
+ }
+ }
+
+ /* -------------------- epilog 0 -------------------- */
+
+ tmp2_QS[ 2 ] = state_QS[ 1 ];
+ tmp2_QS[ 3 ] = state_QS[ 2 ];
+ tmp2_QS[ 4 ] = state_QS[ 3 ];
+ tmp2_QS[ 5 ] = state_QS[ 4 ];
+ tmp2_QS[ 6 ] = state_QS[ 5 ];
+ tmp2_QS[ 7 ] = state_QS[ 6 ];
+
+ state_QS[ 0 ] = tmp1_QS[ 1 ];
+ state_QS[ 1 ] = tmp1_QS[ 2 ];
+ state_QS[ 2 ] = tmp1_QS[ 3 ];
+ state_QS[ 3 ] = tmp1_QS[ 4 ];
+ state_QS[ 4 ] = tmp1_QS[ 5 ];
+ state_QS[ 5 ] = tmp1_QS[ 6 ];
+ state_QS[ 6 ] = tmp1_QS[ 7 ];
+
+ corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 1 ], input_QS[ 1 ] ), 2 * QS - QC );
+ corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+ corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+ corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+ corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+ corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+ corr_QC[ 6 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 7 ] = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 5 ] - tmp1_QS[ 7 ], warping_Q16 );
+ tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 4 ] - tmp1_QS[ 6 ], warping_Q16 );
+ tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 3 ] - tmp1_QS[ 5 ], warping_Q16 );
+ tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 2 ] - tmp1_QS[ 4 ], warping_Q16 );
+ tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 1 ] - tmp1_QS[ 3 ], warping_Q16 );
+ tmp1_QS[ 2 ] = silk_SMLAWB( tmp2_QS[ 2 ], state_QS[ 0 ] - tmp1_QS[ 2 ], warping_Q16 );
+
+ /* -------------------- epilog 1 -------------------- */
+
+ tmp2_QS[ 3 ] = state_QS[ 1 ];
+ tmp2_QS[ 4 ] = state_QS[ 2 ];
+ tmp2_QS[ 5 ] = state_QS[ 3 ];
+ tmp2_QS[ 6 ] = state_QS[ 4 ];
+ tmp2_QS[ 7 ] = state_QS[ 5 ];
+
+ state_QS[ 0 ] = tmp1_QS[ 2 ];
+ state_QS[ 1 ] = tmp1_QS[ 3 ];
+ state_QS[ 2 ] = tmp1_QS[ 4 ];
+ state_QS[ 3 ] = tmp1_QS[ 5 ];
+ state_QS[ 4 ] = tmp1_QS[ 6 ];
+ state_QS[ 5 ] = tmp1_QS[ 7 ];
+
+ corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 2 ], input_QS[ 2 ] ), 2 * QS - QC );
+ corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+ corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+ corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+ corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+ corr_QC[ 5 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 7 ] = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 4 ] - tmp1_QS[ 7 ], warping_Q16 );
+ tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 3 ] - tmp1_QS[ 6 ], warping_Q16 );
+ tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 2 ] - tmp1_QS[ 5 ], warping_Q16 );
+ tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 1 ] - tmp1_QS[ 4 ], warping_Q16 );
+ tmp1_QS[ 3 ] = silk_SMLAWB( tmp2_QS[ 3 ], state_QS[ 0 ] - tmp1_QS[ 3 ], warping_Q16 );
+
+ /* -------------------- epilog 2 -------------------- */
+
+ tmp2_QS[ 4 ] = state_QS[ 1 ];
+ tmp2_QS[ 5 ] = state_QS[ 2 ];
+ tmp2_QS[ 6 ] = state_QS[ 3 ];
+ tmp2_QS[ 7 ] = state_QS[ 4 ];
+
+ state_QS[ 0 ] = tmp1_QS[ 3 ];
+ state_QS[ 1 ] = tmp1_QS[ 4 ];
+ state_QS[ 2 ] = tmp1_QS[ 5 ];
+ state_QS[ 3 ] = tmp1_QS[ 6 ];
+ state_QS[ 4 ] = tmp1_QS[ 7 ];
+
+ corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 3 ], input_QS[ 3 ] ), 2 * QS - QC );
+ corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+ corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+ corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+ corr_QC[ 4 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 7 ] = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 3 ] - tmp1_QS[ 7 ], warping_Q16 );
+ tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 2 ] - tmp1_QS[ 6 ], warping_Q16 );
+ tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 1 ] - tmp1_QS[ 5 ], warping_Q16 );
+ tmp1_QS[ 4 ] = silk_SMLAWB( tmp2_QS[ 4 ], state_QS[ 0 ] - tmp1_QS[ 4 ], warping_Q16 );
+
+ /* -------------------- epilog 3 -------------------- */
+
+ tmp2_QS[ 5 ] = state_QS[ 1 ];
+ tmp2_QS[ 6 ] = state_QS[ 2 ];
+ tmp2_QS[ 7 ] = state_QS[ 3 ];
+
+ state_QS[ 0 ] = tmp1_QS[ 4 ];
+ state_QS[ 1 ] = tmp1_QS[ 5 ];
+ state_QS[ 2 ] = tmp1_QS[ 6 ];
+ state_QS[ 3 ] = tmp1_QS[ 7 ];
+
+ corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 4 ], input_QS[ 4 ] ), 2 * QS - QC );
+ corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+ corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+ corr_QC[ 3 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+ tmp1_QS[ 7 ] = silk_SMLAWB( tmp2_QS[ 7 ], state_QS[ 2 ] - tmp1_QS[ 7 ], warping_Q16 );
+ tmp1_QS[ 6 ] = silk_SMLAWB( tmp2_QS[ 6 ], state_QS[ 1 ] - tmp1_QS[ 6 ], warping_Q16 );
+ tmp1_QS[ 5 ] = silk_SMLAWB( tmp2_QS[ 5 ], state_QS[ 0 ] - tmp1_QS[ 5 ], warping_Q16 );
+
+ /* -------------------- epilog 4 -------------------- */
+
+ corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 5 ], input_QS[ 5 ] ), 2 * QS - QC );
+ corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 6 ], input_QS[ 6 ] ), 2 * QS - QC );
+ corr_QC[ 2 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 7 ], input_QS[ 7 ] ), 2 * QS - QC );
+
+ opus_int32 tmp1_QS_2 = silk_SMLAWB( state_QS[ 1 ], tmp1_QS[ 5 ] - tmp1_QS[ 6 ], warping_Q16 );
+ state_QS[ 1 ] = silk_SMLAWB( state_QS[ 2 ], tmp1_QS[ 6 ] - tmp1_QS[ 7 ], warping_Q16 );
+
+ /* -------------------- epilog 5 & 6 -------------------- */
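+ /* The final two stages are merged: only corr_QC[ 0 ] and corr_QC[ 1 ] still receive contributions from the last samples. */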
+
+ state_QS[ 0 ] = silk_SMLAWB( tmp1_QS[ 6 ], tmp1_QS_2 - state_QS[ 1 ], warping_Q16 );
+ state_QS[ 2 ] = tmp1_QS[ 7 ];
+
+ corr_QC[ 0 ] += silk_RSHIFT64( silk_SMULL( tmp1_QS_2, input_QS[ 6 ] ), 2 * QS - QC )
+ + silk_RSHIFT64( silk_SMULL( state_QS[ 0 ], input_QS[ 7 ] ), 2 * QS - QC );
+ corr_QC[ 1 ] += silk_RSHIFT64( silk_SMULL( state_QS[ 1 ], input_QS[ 7 ] ), 2 * QS - QC );
+ }
+ }
+
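+ /* Process any remaining samples (when length is not a multiple of 8) one at a time with the scalar allpass chain. */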
+ for( ; n < length; n++ ) {
+ input_QS[ 0 ] = tmp1_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
+ /* Loop over allpass sections */
+ for( i = 0; i <= order; i++ ) {
+ /* Output of allpass section */
+ tmp2_QS[ 0 ] = silk_SMLAWB( state_QS[ order - i ], state_QS[ order - i - 1 ] - tmp1_QS[ 0 ], warping_Q16 );
+ state_QS[ order - i ] = tmp1_QS[ 0 ];
+ corr_QC[ order - i ] += silk_RSHIFT64( silk_SMULL( tmp1_QS[ 0 ], input_QS[ 0 ] ), 2 * QS - QC );
+ tmp1_QS[ 0 ] = tmp2_QS[ 0 ];
+ }
+ }
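+ /* corr_QC[] is accumulated in reverse order here: corr_QC[ order ] holds the lag-0 energy, which determines the output scaling. */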
+ lsh = silk_CLZ64( corr_QC[ order ] ) - 35;
+ lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
+ *scale = -( QC + lsh );
+ silk_assert( *scale >= -30 && *scale <= 12 );
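+ /* Apply the scaling and reverse the index order when writing the 32-bit output correlations. */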
+ if( lsh >= 0 ) {
+ for( i = 0; i <= order; i++ ) {
+ corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QC[ i ], lsh ) );
+ }
+ } else {
+ for( i = 0; i <= order; i++ ) {
+ corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QC[ i ], -lsh ) );
+ }
+ }
+ silk_assert( corr_QC[ order ] >= 0 ); /* If breaking, decrease QC*/
+}
+
+#define MAX_LENGTH 360
+
+static int test_warped_autocorrelation(int arch)
+{
+ unsigned int i;
+ opus_int32 corrOrg[MAX_SHAPE_LPC_ORDER + 1], corrOpt[MAX_SHAPE_LPC_ORDER + 1];
+ opus_int scaleOrg, scaleOpt;
+ opus_int16 input[MAX_LENGTH];
+ opus_int warping_Q16, length, order;
+ (void)arch;
+
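+ /* Sweep all even orders and all lengths with random input, requiring bit-exact agreement between the default and reference implementations. */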
+ printf("%50s", "silk_warped_autocorrelation_FIX() ...");
+ for( order = 0; order <= MAX_SHAPE_LPC_ORDER; order += 2 ) /* order must be even */
+ {
+ for( length = 0; length <= MAX_LENGTH; length++ )
+ {
+ for (i=0;i<MAX_LENGTH;++i)
+ {
+ input[i] = (rand() % 32767) - 16384;
+ }
+ warping_Q16 = rand() % 32767;
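+ /* Pre-fill both buffers with identical contents so that entries beyond the current order compare equal in the memcmp() below. */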
+ memcpy(corrOpt, corrOrg, sizeof(corrOrg));
+
+ silk_warped_autocorrelation_FIX_c(corrOrg, &scaleOrg, input, warping_Q16, length, order);
+ silk_warped_autocorrelation_FIX (corrOpt, &scaleOpt, input, warping_Q16, length, order);
+ if (memcmp(corrOpt, corrOrg, sizeof(corrOrg)))
+ {
+ printf("order=%2d length=%3d failed!\n", order, length);
+ for (i=0;i<sizeof(corrOrg) / sizeof(*corrOrg);i++)
+ {
+ if (corrOrg[i] != corrOpt[i])
+ {
+ printf("\ncorrOrg[%3u]=%12d, corrOpt[%3u]=%12d", i, corrOrg[i], i, corrOpt[i]);
+ }
+ }
+ printf("\n");
+ return -1;
+ }
+ }
+ }
+ printf(" passed!\n");
+ return 0;
+}
+#endif /* FIXED_POINT */
diff --git a/silk_headers.mk b/silk_headers.mk
index f8bf1d2..c826db9 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -28,6 +28,7 @@ silk/arm/macros_arm64.h \
silk/arm/SigProc_FIX_armv4.h \
silk/arm/SigProc_FIX_armv5e.h \
silk/arm/NSQ_neon.h \
+silk/arm/warped_autocorrelation_FIX_arm.h \
silk/fixed/main_FIX.h \
silk/fixed/structs_FIX.h \
silk/fixed/mips/noise_shape_analysis_FIX_mipsr1.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 7229ee3..62d2cdd 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -84,7 +84,8 @@ silk/x86/VQ_WMat_EC_sse.c
SILK_SOURCES_ARM_NEON_INTR = \
silk/arm/arm_silk_map.c \
-silk/arm/NSQ_neon.c
+silk/arm/NSQ_neon.c \
+silk/arm/warped_autocorrelation_FIX_neon_intr.c
SILK_SOURCES_FIXED = \
silk/fixed/LTP_analysis_filter_FIX.c \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
new file mode 100644
index 0000000..b5c25d9
--- /dev/null
+++ b/tests/test_unit_optimization.c
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <stdio.h>
+#include "stack_alloc.h"
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#ifdef FIXED_POINT
+
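+ /* The individual unit tests are pulled in directly so they build into this single test binary. */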
+# include "celt/tests/test_unit_optimization_lpc.c"
+# include "silk/tests/test_unit_optimization_warped_autocorrelation.c"
+
+#endif
+
+int main(void)
+{
+ int result = 0; /* 0: passed; other: failed */
+ int count = 10;
+#ifdef FIXED_POINT
+ int arch;
+#endif /* FIXED_POINT */
+ ALLOC_STACK;
+#ifdef FIXED_POINT
+ arch = opus_select_arch();
+#endif /* FIXED_POINT */
+
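+ /* Run the whole suite several times, with fresh random input each pass. */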
+ while (!result && count--) {
+ printf("\n--------------------------- Testing optimization ---------------------------\n");
+#ifdef FIXED_POINT
+ result |= test_fir(arch);
+ result |= test_warped_autocorrelation(arch);
+#endif /* FIXED_POINT */
+ }
+ return result;
+}
--
2.8.0.rc3.226.g39d4020