[opus] [PATCH 5/5] Optimize silk/LPC_inv_pred_gain.c for ARM NEON
Linfeng Zhang
linfengz at google.com
Thu Jul 14 00:49:02 UTC 2016
Optimized LPC_inverse_pred_gain_QA(), silk_LPC_inverse_pred_gain() and
silk_LPC_inverse_pred_gain_Q24() for ARM NEON.
Created corresponding unit test.
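
For readers unfamiliar with the Opus RTCD convention this patch follows: the public
function name becomes a macro that either calls the portable _c implementation
directly or, when run-time CPU detection is enabled, indexes a per-architecture
function-pointer table with the caller's arch value (see the changes to
silk/SigProc_FIX.h and silk/arm/arm_silk_map.c below). Here is a minimal,
self-contained sketch of that pattern; all names in it (my_gain, gain_impl,
MY_ARCHMASK) are illustrative placeholders, not the actual Opus symbols.

#include <stdio.h>

#define MY_ARCHMASK 3                        /* stands in for OPUS_ARCHMASK */

/* Dummy implementations; both return a placeholder Q30 "gain". */
static int gain_c(const short *a, int order)    { (void)a; (void)order; return 1 << 30; }
static int gain_neon(const short *a, int order) { (void)a; (void)order; return 1 << 30; }

/* One entry per architecture level; levels without NEON fall back to the C version. */
static int (*const gain_impl[MY_ARCHMASK + 1])(const short *, int) = {
    gain_c,    /* ARMv4 */
    gain_c,    /* EDSP  */
    gain_c,    /* Media */
    gain_neon, /* NEON  */
};

/* The public name becomes a macro; as in the patch, it picks up an 'arch'
 * variable that must already be in scope at the call site. */
#define my_gain(a, order) ((*gain_impl[(arch) & MY_ARCHMASK])((a), (order)))

int main(void)
{
    short coeffs[16] = { 0 };
    int arch = 3;        /* in Opus this would come from opus_select_arch() */
    printf("gain = %d\n", my_gain(coeffs, 16));
    return 0;
}

In this patch, silk_LPC_inverse_pred_gain() and silk_LPC_inverse_pred_gain_Q24()
play the role of my_gain, with silk_LPC_inverse_pred_gain_neon() in the NEON slot.
The (void)(arch) in the non-RTCD fallback macros presumably just keeps the arch
argument referenced so builds without RTCD do not warn about an unused variable.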
---
silk/CNG.c | 2 +-
silk/LPC_inv_pred_gain.c | 18 +-
silk/NLSF2A.c | 3 +-
silk/SigProc_FIX.h | 19 +-
silk/arm/LPC_inv_pred_gain_arm.h | 84 +++++++
silk/arm/LPC_inv_pred_gain_neon_intr.c | 258 +++++++++++++++++++++
silk/arm/arm_silk_map.c | 24 +-
silk/decode_parameters.c | 4 +-
silk/fixed/find_LPC_FIX.c | 2 +-
silk/float/find_LPC_FLP.c | 2 +-
silk/float/main_FLP.h | 3 +-
silk/float/wrappers_FLP.c | 5 +-
silk/init_decoder.c | 1 +
silk/process_NLSFs.c | 4 +-
silk/structs.h | 1 +
.../test_unit_optimization_LPC_inv_pred_gain.c | 107 +++++++++
silk_headers.mk | 1 +
silk_sources.mk | 1 +
tests/test_unit_optimization.c | 9 +-
19 files changed, 523 insertions(+), 25 deletions(-)
create mode 100644 silk/arm/LPC_inv_pred_gain_arm.h
create mode 100644 silk/arm/LPC_inv_pred_gain_neon_intr.c
create mode 100644 silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
diff --git a/silk/CNG.c b/silk/CNG.c
index 8443ad6..78d500a 100644
--- a/silk/CNG.c
+++ b/silk/CNG.c
@@ -142,7 +142,7 @@ void silk_CNG(
silk_CNG_exc( CNG_sig_Q14 + MAX_LPC_ORDER, psCNG->CNG_exc_buf_Q14, length, &psCNG->rand_seed );
/* Convert CNG NLSF to filter representation */
- silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order );
+ silk_NLSF2A( A_Q12, psCNG->CNG_smth_NLSF_Q15, psDec->LPC_order, psDec->arch );
/* Generate CNG signal, by synthesis filtering */
silk_memcpy( CNG_sig_Q14, psCNG->CNG_synth_state, MAX_LPC_ORDER * sizeof( opus_int32 ) );
diff --git a/silk/LPC_inv_pred_gain.c b/silk/LPC_inv_pred_gain.c
index 4af89aa..64747ad 100644
--- a/silk/LPC_inv_pred_gain.c
+++ b/silk/LPC_inv_pred_gain.c
@@ -36,9 +36,11 @@ POSSIBILITY OF SUCH DAMAGE.
#define MUL32_FRAC_Q(a32, b32, Q) ((opus_int32)(silk_RSHIFT_ROUND64(silk_SMULL(a32, b32), Q)))
-/* Compute inverse of LPC prediction gain, and */
-/* test if LPC coefficients are stable (all poles within unit circle) */
-static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inverse prediction gain in energy domain, Q30 */
+/* Compute inverse of LPC prediction gain, and */
+/* test if LPC coefficients are stable (all poles within unit circle) */
+/* Note that specific platforms' optimizations don't guarantee an identical A_QA buffer. */
+/* Since the intermediate A_QA buffer is never used again in the caller functions, that's fine. */
+static opus_int32 LPC_inverse_pred_gain_QA_c( /* O Returns inverse prediction gain in energy domain, Q30 */
opus_int32 A_QA[ 2 ][ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */
const opus_int order /* I Prediction order */
)
@@ -106,7 +108,7 @@ static opus_int32 LPC_inverse_pred_gain_QA( /* O Returns inver
}
/* For input in Q12 domain */
-opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
const opus_int order /* I Prediction order */
)
@@ -127,13 +129,14 @@ opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse predi
if( DC_resp >= 4096 ) {
return 0;
}
- return LPC_inverse_pred_gain_QA( Atmp_QA, order );
+ return LPC_inverse_pred_gain_QA_c( Atmp_QA, order );
+ /* Don't use the Atmp_QA buffer anymore from here, because specific platforms' optimizations don't guarantee identical values. */
}
#ifdef FIXED_POINT
/* For input in Q24 domain */
-opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
const opus_int order /* I Prediction order */
)
@@ -149,6 +152,7 @@ opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse pred
Anew_QA[ k ] = silk_RSHIFT32( A_Q24[ k ], 24 - QA );
}
- return LPC_inverse_pred_gain_QA( Atmp_QA, order );
+ return LPC_inverse_pred_gain_QA_c( Atmp_QA, order );
+ /* Don't use the Atmp_QA buffer anymore from here, because specific platforms' optimizations don't guarantee identical values. */
}
#endif
diff --git a/silk/NLSF2A.c b/silk/NLSF2A.c
index b1c559e..a259212 100644
--- a/silk/NLSF2A.c
+++ b/silk/NLSF2A.c
@@ -66,7 +66,8 @@ static OPUS_INLINE void silk_NLSF2A_find_poly(
void silk_NLSF2A(
opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */
const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */
- const opus_int d /* I filter order (should be even) */
+ const opus_int d, /* I filter order (should be even) */
+ int arch /* I Run-time architecture */
)
{
/* This ordering was found to maximize quality. It improves numerical accuracy of
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index b632994..570ae11 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -47,6 +47,10 @@ extern "C"
#include "x86/SigProc_FIX_sse.h"
#endif
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/LPC_inv_pred_gain_arm.h"
+#endif
+
/********************************************************************/
/* SIGNAL PROCESSING FUNCTIONS */
/********************************************************************/
@@ -132,13 +136,13 @@ void silk_bwexpander_32(
/* Compute inverse of LPC prediction gain, and */
/* test if LPC coefficients are stable (all poles within unit circle) */
-opus_int32 silk_LPC_inverse_pred_gain( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
const opus_int order /* I Prediction order */
);
/* For input in Q24 domain */
-opus_int32 silk_LPC_inverse_pred_gain_Q24( /* O Returns inverse prediction gain in energy domain, Q30 */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_c( /* O Returns inverse prediction gain in energy domain, Q30 */
const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
const opus_int order /* I Prediction order */
);
@@ -152,6 +156,14 @@ void silk_ana_filt_bank_1(
const opus_int32 N /* I Number of input samples */
);
+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
+#define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),silk_LPC_inverse_pred_gain_c(A_Q12, order))
+#endif
+
+#if !defined(OVERRIDE_silk_LPC_inverse_pred_gain_Q24)
+#define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((void)(arch),silk_LPC_inverse_pred_gain_Q24_c(A_Q24, order))
+#endif
+
/********************************************************************/
/* SCALAR FUNCTIONS */
/********************************************************************/
@@ -271,7 +283,8 @@ void silk_A2NLSF(
void silk_NLSF2A(
opus_int16 *a_Q12, /* O monic whitening filter coefficients in Q12, [ d ] */
const opus_int16 *NLSF, /* I normalized line spectral frequencies in Q15, [ d ] */
- const opus_int d /* I filter order (should be even) */
+ const opus_int d, /* I filter order (should be even) */
+ int arch /* I Run-time architecture */
);
void silk_insertion_sort_increasing(
diff --git a/silk/arm/LPC_inv_pred_gain_arm.h b/silk/arm/LPC_inv_pred_gain_arm.h
new file mode 100644
index 0000000..77d7167
--- /dev/null
+++ b/silk/arm/LPC_inv_pred_gain_arm.h
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(LPC_INV_PRED_GAIN_ARM_H)
+# define LPC_INV_PRED_GAIN_ARM_H
+
+# include "celt/arm/armcpu.h"
+
+# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD)
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),PRESUME_NEON(silk_LPC_inverse_pred_gain)(A_Q12, order))
+# endif
+
+# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK+1])(const opus_int16 *A_Q12, const opus_int order);
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order) ((*SILK_LPC_INVERSE_PRED_GAIN_IMPL[(arch)&OPUS_ARCHMASK])(A_Q12, order))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+# define OVERRIDE_silk_LPC_inverse_pred_gain (1)
+# define silk_LPC_inverse_pred_gain(A_Q12, order) ((void)(arch),silk_LPC_inverse_pred_gain_neon(A_Q12, order))
+# endif
+# endif
+
+# if defined(FIXED_POINT)
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON)
+opus_int32 silk_LPC_inverse_pred_gain_Q24_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
+ const opus_int order /* I Prediction order */
+);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD)
+# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1)
+# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((void)(arch),PRESUME_NEON(silk_LPC_inverse_pred_gain_Q24)(A_Q24, order))
+# endif
+
+# if !defined(OVERRIDE_silk_LPC_inverse_pred_gain_Q24)
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[OPUS_ARCHMASK+1])(const opus_int32 *A_Q24, const opus_int order);
+# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1)
+# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((*SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[(arch)&OPUS_ARCHMASK])(A_Q24, order))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+# define OVERRIDE_silk_LPC_inverse_pred_gain_Q24 (1)
+# define silk_LPC_inverse_pred_gain_Q24(A_Q24, order) ((void)(arch),silk_LPC_inverse_pred_gain_Q24_neon(A_Q24, order))
+# endif
+# endif
+
+# endif /* end FIXED_POINT */
+
+#endif /* end LPC_INV_PRED_GAIN_ARM_H */
diff --git a/silk/arm/LPC_inv_pred_gain_neon_intr.c b/silk/arm/LPC_inv_pred_gain_neon_intr.c
new file mode 100644
index 0000000..29f0e57
--- /dev/null
+++ b/silk/arm/LPC_inv_pred_gain_neon_intr.c
@@ -0,0 +1,258 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+ @file LPC_inv_pred_gain_neon_intr.c
+ @brief ARM Neon Intrinsic optimizations for silk silk_LPC_inverse_pred_gain functions
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include <arm_neon.h>
+#include "stack_alloc.h"
+#include "main_FIX.h"
+
+#define QA 24
+#define A_LIMIT SILK_FIX_CONST( 0.99975, QA )
+
+/* Compute inverse of LPC prediction gain, and */
+/* test if LPC coefficients are stable (all poles within unit circle) */
+/* Note that this NEON optimization doesn't guarantee an identical A_QA buffer. */
+/* Since the intermediate A_QA buffer is never used again in the caller functions, that's fine. */
+static opus_int32 LPC_inverse_pred_gain_QA_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ opus_int32 A_QA[ 2 ][ SILK_MAX_ORDER_LPC ], /* I Prediction coefficients */
+ const opus_int order /* I Prediction order */
+)
+{
+ opus_int k, n, mult2Q;
+ opus_int32 invGain_Q30, rc_Q31, rc_mult1_Q30, rc_mult2;
+ opus_int32 *Aold_QA, *Anew_QA;
+
+ Anew_QA = A_QA[ order & 1 ];
+
+ invGain_Q30 = (opus_int32)1 << 30;
+ for( k = order - 1; k > 0; k-- ) {
+ int32x2_t rc_Q31_s32x2, rc_mult2_s32x2;
+ int64x2_t mult2Q_s64x2;
+
+ /* Check for stability */
+ if( ( Anew_QA[ k ] > A_LIMIT ) || ( Anew_QA[ k ] < -A_LIMIT ) ) {
+ return 0;
+ }
+
+ /* Set RC equal to negated AR coef */
+ rc_Q31 = -silk_LSHIFT( Anew_QA[ k ], 31 - QA );
+
+ /* rc_mult1_Q30 range: [ 1 : 2^30 ] */
+ rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
+ silk_assert( rc_mult1_Q30 > ( 1 << 15 ) ); /* reduce A_LIMIT if fails */
+ silk_assert( rc_mult1_Q30 <= ( 1 << 30 ) );
+
+ /* rc_mult2 range: [ 2^30 : silk_int32_MAX ] */
+ mult2Q = 32 - silk_CLZ32( silk_abs( rc_mult1_Q30 ) );
+ rc_mult2 = silk_INVERSE32_varQ( rc_mult1_Q30, mult2Q + 30 );
+
+ /* Update inverse gain */
+ /* invGain_Q30 range: [ 0 : 2^30 ] */
+ invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
+ silk_assert( invGain_Q30 >= 0 );
+ silk_assert( invGain_Q30 <= ( 1 << 30 ) );
+
+ /* Swap pointers */
+ Aold_QA = Anew_QA;
+ Anew_QA = A_QA[ k & 1 ];
+
+ /* Update AR coefficient */
+ rc_Q31_s32x2 = vdup_n_s32(rc_Q31);
+ mult2Q_s64x2 = vdupq_n_s64(-mult2Q);
+ rc_mult2_s32x2 = vdup_n_s32(rc_mult2);
+
+ for( n = 0; n < k; n += 4 ) {
+ /* We always calculate extra elements of the A_QA buffer when (k % 4) != 0, to take advantage of SIMD parallelization. */
+ int32x4_t Aold_QA_s32x4, Aold_QAr_s32x4, t_s32x4, tmp_QA_s32x4;
+ int64x2_t tmp0_s64x2, tmp1_s64x2;
+ Aold_QA_s32x4 = vld1q_s32(Aold_QA + n);
+ Aold_QAr_s32x4 = vld1q_s32(Aold_QA + k - n - 4);
+ Aold_QAr_s32x4 = vrev64q_s32(Aold_QAr_s32x4);
+ Aold_QAr_s32x4 = vcombine_s32(vget_high_s32(Aold_QAr_s32x4), vget_low_s32(Aold_QAr_s32x4)); // Compiler should generate VSWP.
+ t_s32x4 = vqrdmulhq_lane_s32(Aold_QAr_s32x4, rc_Q31_s32x2, 0);
+ tmp_QA_s32x4 = vsubq_s32(Aold_QA_s32x4, t_s32x4);
+ tmp0_s64x2 = vmull_s32(vget_low_s32 (tmp_QA_s32x4), rc_mult2_s32x2);
+ tmp1_s64x2 = vmull_s32(vget_high_s32(tmp_QA_s32x4), rc_mult2_s32x2);
+ tmp0_s64x2 = vrshlq_s64(tmp0_s64x2, mult2Q_s64x2);
+ tmp1_s64x2 = vrshlq_s64(tmp1_s64x2, mult2Q_s64x2);
+ t_s32x4 = vcombine_s32(vmovn_s64(tmp0_s64x2), vmovn_s64(tmp1_s64x2));
+ vst1q_s32(Anew_QA + n, t_s32x4);
+ }
+ }
+
+ /* Check for stability */
+ if( ( Anew_QA[ 0 ] > A_LIMIT ) || ( Anew_QA[ 0 ] < -A_LIMIT ) ) {
+ return 0;
+ }
+
+ /* Set RC equal to negated AR coef */
+ rc_Q31 = -silk_LSHIFT( Anew_QA[ 0 ], 31 - QA );
+
+ /* Range: [ 1 : 2^30 ] */
+ rc_mult1_Q30 = ( (opus_int32)1 << 30 ) - silk_SMMUL( rc_Q31, rc_Q31 );
+
+ /* Update inverse gain */
+ /* Range: [ 0 : 2^30 ] */
+ invGain_Q30 = silk_LSHIFT( silk_SMMUL( invGain_Q30, rc_mult1_Q30 ), 2 );
+ silk_assert( invGain_Q30 >= 0 );
+ silk_assert( invGain_Q30 <= 1<<30 );
+
+ return invGain_Q30;
+}
+
+/* For input in Q12 domain */
+opus_int32 silk_LPC_inverse_pred_gain_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+)
+{
+ opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ];
+ opus_int32 DC_resp = 0;
+ int16x8_t t0_s16x8, t1_s16x8;
+ int32x4_t t0_s32x4;
+
+ /* Increase Q domain of the AR coefficients */
+ silk_assert(!(order & 1)); // order is even
+ silk_assert(SILK_MAX_ORDER_LPC <= 16);
+ t0_s16x8 = vld1q_s16(A_Q12);
+ t1_s16x8 = vld1q_s16(A_Q12 + 8);
+ t0_s32x4 = vpaddlq_s16(t0_s16x8);
+ switch( order )
+ {
+ case 16:
+ {
+ int32x2_t t_s32x2;
+ int64x1_t t_s64x1;
+ t0_s32x4 = vpadalq_s16(t0_s32x4, t1_s16x8);
+ t_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+ t_s64x1 = vpaddl_s32(t_s32x2);
+ DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+ }
+ break;
+
+ case 14:
+ {
+ int64x1_t t_s64x1;
+ int32x4_t t1_s32x4 = vpaddlq_s16(t1_s16x8);
+ int32x2_t t_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+ t_s32x2 = vadd_s32(t_s32x2, vget_low_s32(t1_s32x4));
+ t_s64x1 = vpaddl_s32(t_s32x2);
+ t_s64x1 = vreinterpret_s64_s32(vadd_s32(vreinterpret_s32_s64(t_s64x1), vget_high_s32(t1_s32x4)));
+ DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+ }
+ break;
+
+ case 12:
+ {
+ int64x1_t t_s64x1;
+ int32x2_t t0_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+ int32x2_t t1_s32x2 = vpaddl_s16(vget_low_s16(t1_s16x8));
+ t0_s32x2 = vadd_s32(t0_s32x2, t1_s32x2);
+ t_s64x1 = vpaddl_s32(t0_s32x2);
+ DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+ }
+ break;
+
+ case 10:
+ {
+ int32x2_t t0_s32x2 = vpadd_s32(vget_low_s32(t0_s32x4), vget_high_s32(t0_s32x4));
+ int32x2_t t1_s32x2 = vpaddl_s16(vget_low_s16(t1_s16x8));
+ int64x1_t t_s64x1 = vpaddl_s32(t0_s32x2);
+ t_s64x1 = vreinterpret_s64_s32(vadd_s32(vreinterpret_s32_s64(t_s64x1), t1_s32x2));
+ DC_resp = vget_lane_s32(vreinterpret_s32_s64(t_s64x1), 0);
+ }
+ break;
+
+ case 8:
+ DC_resp += (opus_int32)A_Q12[ 7 ];
+ DC_resp += (opus_int32)A_Q12[ 6 ];
+
+ case 6:
+ DC_resp += (opus_int32)A_Q12[ 5 ];
+ DC_resp += (opus_int32)A_Q12[ 4 ];
+
+ case 4:
+ DC_resp += (opus_int32)A_Q12[ 3 ];
+ DC_resp += (opus_int32)A_Q12[ 2 ];
+
+ case 2:
+ DC_resp += (opus_int32)A_Q12[ 1 ];
+ DC_resp += (opus_int32)A_Q12[ 0 ];
+
+ default:
+ break;
+ }
+
+ /* If the DC is unstable, we don't even need to do the full calculations */
+ if( DC_resp >= 4096 ) {
+ return 0;
+ }
+ vst1q_s32(Atmp_QA[ 0 ], vshll_n_s16(vget_low_s16 (t0_s16x8), QA - 12));
+ vst1q_s32(Atmp_QA[ 0 ] + 4, vshll_n_s16(vget_high_s16(t0_s16x8), QA - 12));
+ vst1q_s32(Atmp_QA[ 0 ] + 8, vshll_n_s16(vget_low_s16 (t1_s16x8), QA - 12));
+ vst1q_s32(Atmp_QA[ 0 ] + 12, vshll_n_s16(vget_high_s16(t1_s16x8), QA - 12));
+
+ return LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
+}
+
+#ifdef FIXED_POINT
+
+/* For input in Q24 domain */
+opus_int32 silk_LPC_inverse_pred_gain_Q24_neon( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
+ const opus_int order /* I Prediction order */
+)
+{
+ opus_int32 Atmp_QA[ 2 ][ SILK_MAX_ORDER_LPC ];
+
+ /* Increase Q domain of the AR coefficients */
+ silk_assert(!(order & 1)); // order is even
+ silk_assert(SILK_MAX_ORDER_LPC == 16);
+ silk_assert(QA == 24); // No shift.
+ vst1q_s32(Atmp_QA[ 0 ], vld1q_s32(A_Q24));
+ vst1q_s32(Atmp_QA[ 0 ] + 4, vld1q_s32(A_Q24 + 4));
+ vst1q_s32(Atmp_QA[ 0 ] + 8, vld1q_s32(A_Q24 + 8));
+ vst1q_s32(Atmp_QA[ 0 ] + 12, vld1q_s32(A_Q24 + 12));
+
+ return LPC_inverse_pred_gain_QA_neon( Atmp_QA, order );
+}
+
+#endif
diff --git a/silk/arm/arm_silk_map.c b/silk/arm/arm_silk_map.c
index 2e330c4..59ceb6e 100644
--- a/silk/arm/arm_silk_map.c
+++ b/silk/arm/arm_silk_map.c
@@ -30,11 +30,21 @@ POSSIBILITY OF SUCH DAMAGE.
#include "main_FIX.h"
#include "NSQ.h"
+#include "SigProc_FIX.h"
#if defined(OPUS_HAVE_RTCD)
-# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && \
- !defined(OPUS_ARM_PRESUME_NEON_INTR))
+# if (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+
+opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int16 *A_Q12, /* I Prediction coefficients, Q12 [order] */
+ const opus_int order /* I Prediction order */
+) = {
+ silk_LPC_inverse_pred_gain_c, /* ARMv4 */
+ silk_LPC_inverse_pred_gain_c, /* EDSP */
+ silk_LPC_inverse_pred_gain_c, /* Media */
+ MAY_HAVE_NEON(silk_LPC_inverse_pred_gain), /* Neon */
+};
/*There is no table for silk_noise_shape_quantizer_short_prediction because the
NEON version takes different parameters than the C version.
@@ -56,6 +66,16 @@ opus_int32
#if defined(FIXED_POINT) && \
defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+opus_int32 (*const SILK_LPC_INVERSE_PRED_GAIN_Q24_IMPL[OPUS_ARCHMASK + 1])( /* O Returns inverse prediction gain in energy domain, Q30 */
+ const opus_int32 *A_Q24, /* I Prediction coefficients [order] */
+ const opus_int order /* I Prediction order */
+) = {
+ silk_LPC_inverse_pred_gain_Q24_c, /* ARMv4 */
+ silk_LPC_inverse_pred_gain_Q24_c, /* EDSP */
+ silk_LPC_inverse_pred_gain_Q24_c, /* Media */
+ MAY_HAVE_NEON(silk_LPC_inverse_pred_gain_Q24), /* Neon */
+};
+
void (*const SILK_WARPED_AUTOCORRELATION_FIX_IMPL[OPUS_ARCHMASK + 1])(
opus_int32 *corr, /* O Result [order + 1] */
opus_int *scale, /* O Scaling of the correlation vector */
diff --git a/silk/decode_parameters.c b/silk/decode_parameters.c
index e345b1d..a56a409 100644
--- a/silk/decode_parameters.c
+++ b/silk/decode_parameters.c
@@ -52,7 +52,7 @@ void silk_decode_parameters(
silk_NLSF_decode( pNLSF_Q15, psDec->indices.NLSFIndices, psDec->psNLSF_CB );
/* Convert NLSF parameters to AR prediction filter coefficients */
- silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order );
+ silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 1 ], pNLSF_Q15, psDec->LPC_order, psDec->arch );
/* If just reset, e.g., because internal Fs changed, do not allow interpolation */
/* improves the case of packet loss in the first frame after a switch */
@@ -69,7 +69,7 @@ void silk_decode_parameters(
}
/* Convert NLSF parameters to AR prediction filter coefficients */
- silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order );
+ silk_NLSF2A( psDecCtrl->PredCoef_Q12[ 0 ], pNLSF0_Q15, psDec->LPC_order, psDec->arch );
} else {
/* Copy LPC coefficients for first half from second half */
silk_memcpy( psDecCtrl->PredCoef_Q12[ 0 ], psDecCtrl->PredCoef_Q12[ 1 ], psDec->LPC_order * sizeof( opus_int16 ) );
diff --git a/silk/fixed/find_LPC_FIX.c b/silk/fixed/find_LPC_FIX.c
index e11cdc8..e55b63a 100644
--- a/silk/fixed/find_LPC_FIX.c
+++ b/silk/fixed/find_LPC_FIX.c
@@ -92,7 +92,7 @@ void silk_find_LPC_FIX(
silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder );
/* Convert to LPC for residual energy evaluation */
- silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( a_tmp_Q12, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch );
/* Calculate residual energy with NLSF interpolation */
silk_LPC_analysis_filter( LPC_res, x, a_tmp_Q12, 2 * subfr_length, psEncC->predictLPCOrder, psEncC->arch );
diff --git a/silk/float/find_LPC_FLP.c b/silk/float/find_LPC_FLP.c
index fcfe1c3..4d63964 100644
--- a/silk/float/find_LPC_FLP.c
+++ b/silk/float/find_LPC_FLP.c
@@ -73,7 +73,7 @@ void silk_find_LPC_FLP(
silk_interpolate( NLSF0_Q15, psEncC->prev_NLSFq_Q15, NLSF_Q15, k, psEncC->predictLPCOrder );
/* Convert to LPC for residual energy evaluation */
- silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A_FLP( a_tmp, NLSF0_Q15, psEncC->predictLPCOrder, psEncC->arch );
/* Calculate residual energy with LSF interpolation */
silk_LPC_analysis_filter_FLP( LPC_res, a_tmp, x, 2 * subfr_length, psEncC->predictLPCOrder );
diff --git a/silk/float/main_FLP.h b/silk/float/main_FLP.h
index e5a7597..c2105a5 100644
--- a/silk/float/main_FLP.h
+++ b/silk/float/main_FLP.h
@@ -285,7 +285,8 @@ void silk_A2NLSF_FLP(
void silk_NLSF2A_FLP(
silk_float *pAR, /* O LPC coefficients [ LPC_order ] */
const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */
- const opus_int LPC_order /* I LPC order */
+ const opus_int LPC_order, /* I LPC order */
+ int arch /* I Run-time architecture */
);
/* Limit, stabilize, and quantize NLSFs */
diff --git a/silk/float/wrappers_FLP.c b/silk/float/wrappers_FLP.c
index 6666b8e..53a556e 100644
--- a/silk/float/wrappers_FLP.c
+++ b/silk/float/wrappers_FLP.c
@@ -54,13 +54,14 @@ void silk_A2NLSF_FLP(
void silk_NLSF2A_FLP(
silk_float *pAR, /* O LPC coefficients [ LPC_order ] */
const opus_int16 *NLSF_Q15, /* I NLSF vector [ LPC_order ] */
- const opus_int LPC_order /* I LPC order */
+ const opus_int LPC_order, /* I LPC order */
+ int arch /* I Run-time architecture */
)
{
opus_int i;
opus_int16 a_fix_Q12[ MAX_LPC_ORDER ];
- silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order );
+ silk_NLSF2A( a_fix_Q12, NLSF_Q15, LPC_order, arch );
for( i = 0; i < LPC_order; i++ ) {
pAR[ i ] = ( silk_float )a_fix_Q12[ i ] * ( 1.0f / 4096.0f );
diff --git a/silk/init_decoder.c b/silk/init_decoder.c
index f887c67..16c03dc 100644
--- a/silk/init_decoder.c
+++ b/silk/init_decoder.c
@@ -44,6 +44,7 @@ opus_int silk_init_decoder(
/* Used to deactivate LSF interpolation */
psDec->first_frame_after_reset = 1;
psDec->prev_gain_Q16 = 65536;
+ psDec->arch = opus_select_arch();
/* Reset CNG state */
silk_CNG_Reset( psDec );
diff --git a/silk/process_NLSFs.c b/silk/process_NLSFs.c
index 0ab71f0..2f10f8d 100644
--- a/silk/process_NLSFs.c
+++ b/silk/process_NLSFs.c
@@ -89,7 +89,7 @@ void silk_process_NLSFs(
NLSF_mu_Q20, psEncC->NLSF_MSVQ_Survivors, psEncC->indices.signalType );
/* Convert quantized NLSFs back to LPC coefficients */
- silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( PredCoef_Q12[ 1 ], pNLSF_Q15, psEncC->predictLPCOrder, psEncC->arch );
if( doInterpolate ) {
/* Calculate the interpolated, quantized LSF vector for the first half */
@@ -97,7 +97,7 @@ void silk_process_NLSFs(
psEncC->indices.NLSFInterpCoef_Q2, psEncC->predictLPCOrder );
/* Convert back to LPC coefficients */
- silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder );
+ silk_NLSF2A( PredCoef_Q12[ 0 ], pNLSF0_temp_Q15, psEncC->predictLPCOrder, psEncC->arch );
} else {
/* Copy LPC coefficients for first half from second half */
diff --git a/silk/structs.h b/silk/structs.h
index 827829d..b68e4c9 100644
--- a/silk/structs.h
+++ b/silk/structs.h
@@ -301,6 +301,7 @@ typedef struct {
/* Stuff used for PLC */
opus_int lossCnt;
opus_int prevSignalType;
+ int arch;
silk_PLC_struct sPLC;
diff --git a/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c b/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
new file mode 100644
index 0000000..e98f3f6
--- /dev/null
+++ b/silk/tests/test_unit_optimization_LPC_inv_pred_gain.c
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "SigProc_FIX.h"
+
+static int test_silk_LPC_inverse_pred_gain(int arch)
+{
+ unsigned int i;
+ opus_int order;
+ opus_int16 A_Q12[ SILK_MAX_ORDER_LPC ];
+ opus_int32 rtn_org, rtn_opt;
+ (void)arch;
+
+ printf("%50s", "LPC_inverse_pred_gain() ...");
+ for( order = 2; order <= SILK_MAX_ORDER_LPC; order += 2 ) // order must be even.
+ {
+ for (unsigned int shift = 0; shift < 16; shift++) // Test dynamic range.
+ {
+ for (i = 0; i < SILK_MAX_ORDER_LPC; i++)
+ {
+ A_Q12[i] = ((opus_int16)rand()) >> shift;
+ }
+
+ rtn_org = silk_LPC_inverse_pred_gain_c(A_Q12, order);
+ rtn_opt = silk_LPC_inverse_pred_gain (A_Q12, order);
+ if ((rtn_org != rtn_opt))
+ {
+ printf("order=%2d failed!\n", order);
+ printf("rtn_org=%d rtn_opt=%d!\n", rtn_org, rtn_opt);
+ return -1;
+ }
+ }
+ }
+ printf(" passed!\n");
+ return 0;
+}
+
+#ifdef FIXED_POINT
+
+static int test_silk_LPC_inverse_pred_gain_Q24(int arch)
+{
+ unsigned int i;
+ opus_int order;
+ opus_int32 A_Q24[ SILK_MAX_ORDER_LPC ];
+ opus_int32 rtn_org, rtn_opt;
+ (void)arch;
+
+ printf("%50s", "LPC_inverse_pred_gain_Q24() ...");
+ for( order = 2; order <= SILK_MAX_ORDER_LPC; order += 2 ) // order must be even.
+ {
+ for (unsigned int shift = 0; shift < 31; shift++) // Test dynamic range.
+ {
+ for (i = 0; i < SILK_MAX_ORDER_LPC; i++)
+ {
+ A_Q24[i] = ((opus_int32)rand()) >> shift;
+ }
+
+ rtn_org = silk_LPC_inverse_pred_gain_Q24_c(A_Q24, order);
+ rtn_opt = silk_LPC_inverse_pred_gain_Q24 (A_Q24, order);
+ if ((rtn_org != rtn_opt))
+ {
+ printf("order=%2d failed!\n", order);
+ printf("rtn_org=%d rtn_opt=%d!\n", rtn_org, rtn_opt);
+ return -1;
+ }
+ }
+ }
+ printf(" passed!\n");
+ return 0;
+}
+
+#endif /* FIXED_POINT */
diff --git a/silk_headers.mk b/silk_headers.mk
index 52c42d0..ca9bf27 100644
--- a/silk_headers.mk
+++ b/silk_headers.mk
@@ -22,6 +22,7 @@ silk/resampler_rom.h \
silk/resampler_structs.h \
silk/SigProc_FIX.h \
silk/x86/SigProc_FIX_sse.h \
+silk/arm/LPC_inv_pred_gain_arm.h \
silk/arm/macros_armv4.h \
silk/arm/macros_armv5e.h \
silk/arm/macros_arm64.h \
diff --git a/silk_sources.mk b/silk_sources.mk
index 5f9551b..d8323df 100644
--- a/silk_sources.mk
+++ b/silk_sources.mk
@@ -84,6 +84,7 @@ silk/x86/VQ_WMat_EC_sse.c
SILK_SOURCES_ARM_NEON_INTR = \
silk/arm/arm_silk_map.c \
+silk/arm/LPC_inv_pred_gain_neon_intr.c \
silk/arm/NSQ_neon.c
SILK_SOURCES_FIXED = \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index b5c25d9..8e90074 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -29,6 +29,7 @@
#endif
#include <stdio.h>
+#include "cpu_support.h"
#include "stack_alloc.h"
#define SKIP_CONFIG_H
@@ -44,21 +45,25 @@
#endif
+# include "silk/tests/test_unit_optimization_LPC_inv_pred_gain.c"
+
int main(void)
{
int result = 0; /* 0: passed; other: failed */
ALLOC_STACK;
-#ifdef FIXED_POINT
int arch = opus_select_arch();
-#endif /* FIXED_POINT */
int count = 10;
+ srand(0);
+
while (!result && count--) {
printf("\n--------------------------- Testing optimization ---------------------------\n");
#ifdef FIXED_POINT
result |= test_fir(arch);
+ result |= test_silk_LPC_inverse_pred_gain_Q24(arch);
result |= test_warped_autocorrelation(arch);
#endif /* FIXED_POINT */
+ result |= test_silk_LPC_inverse_pred_gain(arch);
}
return result;
}
--
2.8.0.rc3.226.g39d4020