[opus] [PATCH 14/15] Optimize celt_inner_prod() and dual_inner_prod() for ARM NEON
Linfeng Zhang
linfengz at google.com
Tue Sep 13 00:03:56 UTC 2016
Created corresponding unit test.
The fixed-point optimizations are bit exact with C functions.
The floating-point optimizations are not bit exact with the C functions, because
the order of the floating-point operations changes. But they are bit exact with
the simulation C functions, which simulate the floating-point operations
performed by the optimizations.
---
celt/arm/arm_celt_map.c | 17 ++
celt/arm/pitch_arm.h | 36 ++++
celt/arm/pitch_neon_intr.c | 179 ++++++++++++++++++++
celt/pitch.h | 3 +-
celt/tests/test_unit_dft.c | 1 +
celt/tests/test_unit_mathops.c | 1 +
celt/tests/test_unit_mdct.c | 1 +
celt/tests/test_unit_optimization_pitch.c | 263 ++++++++++++++++++++++++++++++
celt/tests/test_unit_rotation.c | 1 +
celt/x86/pitch_sse.h | 5 +-
celt_sources.mk | 3 +-
tests/test_unit_optimization.c | 4 +
12 files changed, 508 insertions(+), 6 deletions(-)
create mode 100644 celt/arm/pitch_neon_intr.c
create mode 100644 celt/tests/test_unit_optimization_pitch.c
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 6e28c70..a1a553a 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -36,6 +36,23 @@
#if defined(OPUS_HAVE_RTCD)
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+ celt_inner_prod_c, /* ARMv4: generic C version */
+ celt_inner_prod_c, /* EDSP: generic C version */
+ celt_inner_prod_c, /* Media: generic C version */
+ MAY_HAVE_NEON(celt_inner_prod) /* NEON: whatever MAY_HAVE_NEON() resolves to */
+}; /* Indexed with (arch & OPUS_ARCHMASK) by the celt_inner_prod() macro in pitch_arm.h. */
+/* Same layout for dual_inner_prod(): only the NEON slot can differ from the C version. */
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+ int N, opus_val32 *xy1, opus_val32 *xy2) = {
+ dual_inner_prod_c, /* ARMv4: generic C version */
+ dual_inner_prod_c, /* EDSP: generic C version */
+ dual_inner_prod_c, /* Media: generic C version */
+ MAY_HAVE_NEON(dual_inner_prod) /* NEON: whatever MAY_HAVE_NEON() resolves to */
+}; /* Indexed with (arch & OPUS_ARCHMASK) by the dual_inner_prod() macro in pitch_arm.h. */
+# endif
+
# if defined(FIXED_POINT)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index d8b022e..d1a8db0 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -30,6 +30,42 @@
# include "armcpu.h"
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
+ const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+/* No RTCD: bind both calls at compile time through PRESUME_NEON() (defined outside this header) and ignore arch. */
+# if !defined(OPUS_HAVE_RTCD)
+# define OVERRIDE_CELT_INNER_PROD (1)
+# define OVERRIDE_DUAL_INNER_PROD (1)
+# define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N))
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
+# endif /* !OPUS_HAVE_RTCD */
+/* RTCD builds: dispatch celt_inner_prod() through CELT_INNER_PROD_IMPL unless NEON intrinsics are presumed, in which case call the NEON version directly. */
+# if !defined(OVERRIDE_CELT_INNER_PROD)
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N);
+# define OVERRIDE_CELT_INNER_PROD (1)
+# define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+# define OVERRIDE_CELT_INNER_PROD (1)
+# define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N))
+# endif /* RTCD table vs. presumed NEON intrinsics */
+# endif /* !OVERRIDE_CELT_INNER_PROD */
+/* Same selection as above, applied to dual_inner_prod() and DUAL_INNER_PROD_IMPL. */
+# if !defined(OVERRIDE_DUAL_INNER_PROD)
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
+ const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+# define OVERRIDE_DUAL_INNER_PROD (1)
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
+# define OVERRIDE_DUAL_INNER_PROD (1)
+# define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
+# endif /* RTCD table vs. presumed NEON intrinsics */
+# endif /* !OVERRIDE_DUAL_INNER_PROD */
+
# if defined(FIXED_POINT)
# if defined(OPUS_ARM_MAY_HAVE_NEON)
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
new file mode 100644
index 0000000..2bda6e1
--- /dev/null
+++ b/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+ @file pitch_neon_intr.c
+ @brief ARM Neon Intrinsic optimizations for celt pitch functions
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "pitch.h"
+
+/* NEON version of celt_inner_prod(): returns the inner product of x[0..N-1]
+ * and y[0..N-1].
+ *
+ * Elements are consumed eight at a time, then at most one group of four; the
+ * remaining zero to three elements are folded in with MAC16_16(). In
+ * floating-point builds the four vector lanes are reduced as
+ * (lane0 + lane2) + (lane1 + lane3) before the scalar tail is added, so the
+ * result may differ from the strictly sequential C version. */
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i = 0;
+    opus_val32 xy;
+
+#ifdef FIXED_POINT
+    int32x4_t acc_s32x4 = vdupq_n_s32(0);
+    int64x2_t pair_s64x2;
+    int64x1_t total_s64x1;
+
+    /* Groups of eight: two widening multiply-accumulates per iteration. */
+    while (N - i >= 8) {
+        const int16x8_t xv_s16x8 = vld1q_s16(&x[i]);
+        const int16x8_t yv_s16x8 = vld1q_s16(&y[i]);
+        acc_s32x4 = vmlal_s16(acc_s32x4, vget_low_s16(xv_s16x8), vget_low_s16(yv_s16x8));
+        acc_s32x4 = vmlal_s16(acc_s32x4, vget_high_s16(xv_s16x8), vget_high_s16(yv_s16x8));
+        i += 8;
+    }
+
+    /* At most one remaining group of four. */
+    if (N - i >= 4) {
+        acc_s32x4 = vmlal_s16(acc_s32x4, vld1_s16(&x[i]), vld1_s16(&y[i]));
+        i += 4;
+    }
+
+    /* Horizontal add through 64-bit lanes, then read lane 0 of the 32-bit view. */
+    pair_s64x2 = vpaddlq_s32(acc_s32x4);
+    total_s64x1 = vadd_s64(vget_low_s64(pair_s64x2), vget_high_s64(pair_s64x2));
+    xy = vget_lane_s32(vreinterpret_s32_s64(total_s64x1), 0);
+#else
+    float32x4_t acc_f32x4 = vdupq_n_f32(0);
+    float32x2_t half_f32x2;
+
+    /* Groups of eight: both halves accumulate into the same four lanes. */
+    while (N - i >= 8) {
+        acc_f32x4 = vmlaq_f32(acc_f32x4, vld1q_f32(&x[i]), vld1q_f32(&y[i]));
+        acc_f32x4 = vmlaq_f32(acc_f32x4, vld1q_f32(&x[i + 4]), vld1q_f32(&y[i + 4]));
+        i += 8;
+    }
+
+    /* At most one remaining group of four. */
+    if (N - i >= 4) {
+        acc_f32x4 = vmlaq_f32(acc_f32x4, vld1q_f32(&x[i]), vld1q_f32(&y[i]));
+        i += 4;
+    }
+
+    /* (lane0 + lane2, lane1 + lane3), then the pairwise sum of those two. */
+    half_f32x2 = vadd_f32(vget_low_f32(acc_f32x4), vget_high_f32(acc_f32x4));
+    half_f32x2 = vpadd_f32(half_f32x2, half_f32x2);
+    xy = vget_lane_f32(half_f32x2, 0);
+#endif
+
+    /* Scalar tail for the last N % 4 elements. */
+    while (i < N) {
+        xy = MAC16_16(xy, x[i], y[i]);
+        i++;
+    }
+    return xy;
+}
+
+/* NEON version of dual_inner_prod(): computes the inner product of x[0..N-1]
+ * with y01[0..N-1] and with y02[0..N-1] in a single pass, storing the results
+ * in *xy1 and *xy2 respectively.
+ *
+ * Uses the same 8 / 4 / scalar-tail split and the same lane reduction order as
+ * celt_inner_prod_neon(); each x vector is loaded once and used for both
+ * products. */
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+    int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i = 0;
+    opus_val32 sum1, sum2;
+
+#ifdef FIXED_POINT
+    int32x4_t acc1_s32x4 = vdupq_n_s32(0);
+    int32x4_t acc2_s32x4 = vdupq_n_s32(0);
+    int64x2_t pair1_s64x2, pair2_s64x2;
+    int64x1_t total1_s64x1, total2_s64x1;
+
+    /* Groups of eight. */
+    while (N - i >= 8) {
+        const int16x8_t xv_s16x8 = vld1q_s16(&x[i]);
+        const int16x8_t y1v_s16x8 = vld1q_s16(&y01[i]);
+        const int16x8_t y2v_s16x8 = vld1q_s16(&y02[i]);
+        acc1_s32x4 = vmlal_s16(acc1_s32x4, vget_low_s16(xv_s16x8), vget_low_s16(y1v_s16x8));
+        acc2_s32x4 = vmlal_s16(acc2_s32x4, vget_low_s16(xv_s16x8), vget_low_s16(y2v_s16x8));
+        acc1_s32x4 = vmlal_s16(acc1_s32x4, vget_high_s16(xv_s16x8), vget_high_s16(y1v_s16x8));
+        acc2_s32x4 = vmlal_s16(acc2_s32x4, vget_high_s16(xv_s16x8), vget_high_s16(y2v_s16x8));
+        i += 8;
+    }
+
+    /* At most one remaining group of four. */
+    if (N - i >= 4) {
+        const int16x4_t xv_s16x4 = vld1_s16(&x[i]);
+        acc1_s32x4 = vmlal_s16(acc1_s32x4, xv_s16x4, vld1_s16(&y01[i]));
+        acc2_s32x4 = vmlal_s16(acc2_s32x4, xv_s16x4, vld1_s16(&y02[i]));
+        i += 4;
+    }
+
+    /* Horizontal add through 64-bit lanes, then read lane 0 of the 32-bit view. */
+    pair1_s64x2 = vpaddlq_s32(acc1_s32x4);
+    pair2_s64x2 = vpaddlq_s32(acc2_s32x4);
+    total1_s64x1 = vadd_s64(vget_low_s64(pair1_s64x2), vget_high_s64(pair1_s64x2));
+    total2_s64x1 = vadd_s64(vget_low_s64(pair2_s64x2), vget_high_s64(pair2_s64x2));
+    sum1 = vget_lane_s32(vreinterpret_s32_s64(total1_s64x1), 0);
+    sum2 = vget_lane_s32(vreinterpret_s32_s64(total2_s64x1), 0);
+#else
+    float32x4_t acc1_f32x4 = vdupq_n_f32(0);
+    float32x4_t acc2_f32x4 = vdupq_n_f32(0);
+    float32x2_t half1_f32x2, half2_f32x2;
+
+    /* Groups of eight: both halves accumulate into the same four lanes. */
+    while (N - i >= 8) {
+        float32x4_t xv_f32x4 = vld1q_f32(&x[i]);
+        acc1_f32x4 = vmlaq_f32(acc1_f32x4, xv_f32x4, vld1q_f32(&y01[i]));
+        acc2_f32x4 = vmlaq_f32(acc2_f32x4, xv_f32x4, vld1q_f32(&y02[i]));
+        xv_f32x4 = vld1q_f32(&x[i + 4]);
+        acc1_f32x4 = vmlaq_f32(acc1_f32x4, xv_f32x4, vld1q_f32(&y01[i + 4]));
+        acc2_f32x4 = vmlaq_f32(acc2_f32x4, xv_f32x4, vld1q_f32(&y02[i + 4]));
+        i += 8;
+    }
+
+    /* At most one remaining group of four. */
+    if (N - i >= 4) {
+        const float32x4_t xv_f32x4 = vld1q_f32(&x[i]);
+        acc1_f32x4 = vmlaq_f32(acc1_f32x4, xv_f32x4, vld1q_f32(&y01[i]));
+        acc2_f32x4 = vmlaq_f32(acc2_f32x4, xv_f32x4, vld1q_f32(&y02[i]));
+        i += 4;
+    }
+
+    /* (lane0 + lane2, lane1 + lane3), then the pairwise sum of those two. */
+    half1_f32x2 = vadd_f32(vget_low_f32(acc1_f32x4), vget_high_f32(acc1_f32x4));
+    half2_f32x2 = vadd_f32(vget_low_f32(acc2_f32x4), vget_high_f32(acc2_f32x4));
+    half1_f32x2 = vpadd_f32(half1_f32x2, half1_f32x2);
+    half2_f32x2 = vpadd_f32(half2_f32x2, half2_f32x2);
+    sum1 = vget_lane_f32(half1_f32x2, 0);
+    sum2 = vget_lane_f32(half2_f32x2, 0);
+#endif
+
+    /* Scalar tail for the last N % 4 elements. */
+    while (i < N) {
+        sum1 = MAC16_16(sum1, x[i], y01[i]);
+        sum2 = MAC16_16(sum2, x[i], y02[i]);
+        i++;
+    }
+    *xy1 = sum1;
+    *xy2 = sum2;
+}
diff --git a/celt/pitch.h b/celt/pitch.h
index d797844..e425f56 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -46,8 +46,7 @@
#include "mips/pitch_mipsr1.h"
#endif
-#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
- || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
# include "arm/pitch_arm.h"
#endif
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 582618e..02904bf 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -54,6 +54,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
+# include "arm/pitch_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "mdct.c"
# include "arm/celt_ne10_fft.c"
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index f5af994..524c1f8 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -69,6 +69,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
+# include "arm/pitch_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "kiss_fft.c"
# include "mdct.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 0658c7a..3b28767 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -55,6 +55,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
+# include "arm/pitch_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "arm/celt_ne10_fft.c"
# include "arm/celt_ne10_mdct.c"
diff --git a/celt/tests/test_unit_optimization_pitch.c b/celt/tests/test_unit_optimization_pitch.c
new file mode 100644
index 0000000..64bb2a9
--- /dev/null
+++ b/celt/tests/test_unit_optimization_pitch.c
@@ -0,0 +1,263 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#include "modes.h"
+#include "pitch.h"
+
+#define MAX_LEN_INNER_PROD 960
+
+#ifndef UNIT_TEST_CELT_INNER_PROD
+#define UNIT_TEST_CELT_INNER_PROD
+
+/* Returns a pseudo-random float obtained by scaling rand() linearly from
+ * [0, RAND_MAX] into [min, max]. Uses OPUS_INLINE, like the other helpers in
+ * this file, instead of the bare C99 `inline` keyword. */
+static OPUS_INLINE float rand_float(float min, float max)
+{
+    return ((max - min) * ((float)rand() / RAND_MAX)) + min;
+}
+
+/* Returns one pseudo-random opus_val16 sample.
+ * Floating-point builds draw it from rand_float(min, max); fixed-point builds
+ * ignore both bounds and return rand() converted to opus_val16. */
+static OPUS_INLINE opus_val16 rand_val16(opus_val16 min, opus_val16 max)
+{
+#ifndef FIXED_POINT
+    return rand_float(min, max);
+#else
+    (void)max;
+    (void)min;
+    return rand();
+#endif
+}
+
+static OPUS_INLINE void init_val16_buffer(opus_val16* buffer, int num)
+{
+ const opus_val16 min = (opus_val16)-1e10;
+ const opus_val16 max = (opus_val16) 1e10;
+
+ for (int i = 0; i < num; i++) {
+ buffer[i] = rand_val16(min, max);
+ }
+}
+
+#endif
+
+/* ========================================================================== */
+/* This part of code simulates floating-point operations. */
+
+#ifndef FIXED_POINT
+
+/* Scalar model of the floating-point operation order used by
+ * celt_inner_prod_sse(); the two are expected to produce bit-exact results.
+ *
+ * Four independent accumulators stand in for the four vector lanes. They are
+ * combined as (lane0 + lane2) + (lane1 + lane3), and the last N % 4 elements
+ * are then added sequentially.
+ */
+opus_val32 celt_inner_prod_float_simulation_sse(const opus_val16 *x,
+    const opus_val16 *y, int N)
+{
+    opus_val32 lane[4] = {0, 0, 0, 0};
+    opus_val32 sum;
+    int i, k;
+
+    for (i = 0; N - i >= 4; i += 4) {
+        for (k = 0; k < 4; k++) {
+            lane[k] = MAC16_16(lane[k], x[i + k], y[i + k]);
+        }
+    }
+    lane[0] += lane[2];
+    lane[1] += lane[3];
+    sum = lane[0] + lane[1];
+    while (i < N) {
+        sum = MAC16_16(sum, x[i], y[i]);
+        i++;
+    }
+    return sum;
+}
+
+/* Scalar model of the floating-point operation order used by
+ * dual_inner_prod_sse(); the two are expected to produce bit-exact results.
+ *
+ * Each of the two products keeps four independent accumulators (one per
+ * vector lane), combines them as (lane0 + lane2) + (lane1 + lane3), and then
+ * adds the last N % 4 elements sequentially. Results go to *xy1 and *xy2.
+ */
+void dual_inner_prod_float_simulation_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+    int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    opus_val32 lane1[4] = {0, 0, 0, 0};
+    opus_val32 lane2[4] = {0, 0, 0, 0};
+    opus_val32 sum1, sum2;
+    int i, k;
+
+    for (i = 0; N - i >= 4; i += 4) {
+        for (k = 0; k < 4; k++) {
+            lane1[k] = MAC16_16(lane1[k], x[i + k], y01[i + k]);
+            lane2[k] = MAC16_16(lane2[k], x[i + k], y02[i + k]);
+        }
+    }
+    lane1[0] += lane1[2];
+    lane2[0] += lane2[2];
+    lane1[1] += lane1[3];
+    lane2[1] += lane2[3];
+    sum1 = lane1[0] + lane1[1];
+    sum2 = lane2[0] + lane2[1];
+    while (i < N) {
+        sum1 = MAC16_16(sum1, x[i], y01[i]);
+        sum2 = MAC16_16(sum2, x[i], y02[i]);
+        i++;
+    }
+    *xy1 = sum1;
+    *xy2 = sum2;
+}
+
+# define celt_inner_prod_float_simulation_c celt_inner_prod_c
+# define dual_inner_prod_float_simulation_c dual_inner_prod_c
+
+/* Reuse since NEON optimizations happen to have the same simulated floating-point operations as SSE optimization. */
+# define celt_inner_prod_float_simulation_neon celt_inner_prod_float_simulation_sse
+# define dual_inner_prod_float_simulation_neon dual_inner_prod_float_simulation_sse
+
+# ifdef OPUS_X86_MAY_HAVE_SSE /* x86: pick the reference that matches what celt_inner_prod()/dual_inner_prod() will run */
+# define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1)
+# define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1)
+# ifdef OPUS_X86_PRESUME_SSE /* SSE presumed: always compare against the SSE simulation */
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), celt_inner_prod_float_simulation_sse(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_float_simulation_sse(x, y01, y02, N, xy1, xy2))
+# else /* otherwise select the reference per arch through the tables below */
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+ celt_inner_prod_float_simulation_c, /* non-sse: the plain C function is its own reference */
+ MAY_HAVE_SSE(celt_inner_prod_float_simulation), /* remaining arch slots all go through MAY_HAVE_SSE() */
+ MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+ MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+ MAY_HAVE_SSE(celt_inner_prod_float_simulation)
+};
+void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) = {
+ dual_inner_prod_float_simulation_c, /* non-sse: the plain C function is its own reference */
+ MAY_HAVE_SSE(dual_inner_prod_float_simulation), /* remaining arch slots all go through MAY_HAVE_SSE() */
+ MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+ MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+ MAY_HAVE_SSE(dual_inner_prod_float_simulation)
+};
+# endif /* !defined(OPUS_X86_PRESUME_SSE) */
+# endif /* OPUS_X86_MAY_HAVE_SSE */
+
+# ifdef OPUS_ARM_MAY_HAVE_NEON_INTR /* ARM: mirror the three-way selection made in arm/pitch_arm.h */
+# define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1)
+# define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1)
+# ifndef OPUS_HAVE_RTCD /* no RTCD: PRESUME_NEON() picks the _neon or _c reference at compile time */
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod_float_simulation)(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod_float_simulation)(x, y01, y02, N, xy1, xy2))
+# else
+# ifdef OPUS_ARM_PRESUME_NEON_INTR /* NEON intrinsics presumed: always compare against the NEON simulation */
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), celt_inner_prod_float_simulation_neon(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_float_simulation_neon(x, y01, y02, N, xy1, xy2))
+# else /* otherwise select the reference per arch through the tables below */
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+ celt_inner_prod_float_simulation_c, /* ARMv4: plain C reference */
+ celt_inner_prod_float_simulation_c, /* EDSP: plain C reference */
+ celt_inner_prod_float_simulation_c, /* Media: plain C reference */
+ MAY_HAVE_NEON(celt_inner_prod_float_simulation) /* NEON: whatever MAY_HAVE_NEON() resolves to */
+};
+void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) = {
+ dual_inner_prod_float_simulation_c, /* ARMv4: plain C reference */
+ dual_inner_prod_float_simulation_c, /* EDSP: plain C reference */
+ dual_inner_prod_float_simulation_c, /* Media: plain C reference */
+ MAY_HAVE_NEON(dual_inner_prod_float_simulation) /* NEON: whatever MAY_HAVE_NEON() resolves to */
+};
+# endif /* !defined(OPUS_ARM_PRESUME_NEON_INTR) */
+# endif /* OPUS_HAVE_RTCD */
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+# ifndef OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION
+# define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch),celt_inner_prod_float_simulation_c(x, y, N))
+# endif
+
+# ifndef OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION
+# define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch),dual_inner_prod_float_simulation_c(x, y01, y02, N, xy1, xy2))
+# endif
+
+#endif /* !FIXED_POINT */
+
+/* ========================================================================== */
+
+/* Compares celt_inner_prod() for the given arch against a reference for every
+ * length N in [0, MAX_LEN_INNER_PROD] on random input.
+ *
+ * The reference is celt_inner_prod_c() in fixed-point builds and
+ * celt_inner_prod_float_simulation() in floating-point builds; results must
+ * be exactly equal. Returns 0 if every length matches, -1 at the first
+ * mismatch. */
+static int test_celt_inner_prod(int arch)
+{
+    opus_val16 x[MAX_LEN_INNER_PROD], y[MAX_LEN_INNER_PROD];
+    opus_val32 xy_org, xy_opt;
+    int N;
+
+    /* Literal name instead of the C99-only __func__; the output is unchanged. */
+    printf("%44s() ...", "test_celt_inner_prod");
+    init_val16_buffer(x, MAX_LEN_INNER_PROD);
+    init_val16_buffer(y, MAX_LEN_INNER_PROD);
+    for (N = 0; N <= MAX_LEN_INNER_PROD; N++) {
+#ifdef FIXED_POINT
+        xy_org = celt_inner_prod_c(x, y, N);
+#else
+        xy_org = celt_inner_prod_float_simulation(x, y, N, arch);
+#endif
+        xy_opt = celt_inner_prod(x, y, N, arch);
+        if (xy_org != xy_opt) {
+            /* End the report with a newline so later output starts on its own line. */
+#ifdef FIXED_POINT
+            printf("\nN=%d xy_org = %d, xy_opt = %d failed!\n", N, xy_org, xy_opt);
+#else
+            printf("\nN=%d xy_org = %f, xy_opt = %f failed!\n", N, xy_org, xy_opt);
+#endif
+            return -1;
+        }
+    }
+    printf(" passed!\n");
+    return 0;
+}
+
+/* Compares dual_inner_prod() for the given arch against a reference for every
+ * length N in [0, MAX_LEN_INNER_PROD] on random input.
+ *
+ * The reference is dual_inner_prod_c() in fixed-point builds and
+ * dual_inner_prod_float_simulation() in floating-point builds; both outputs
+ * must be exactly equal. Only the output(s) that actually differ are reported.
+ * Returns 0 if every length matches, -1 at the first mismatch. */
+static int test_dual_inner_prod(int arch)
+{
+    opus_val16 x[MAX_LEN_INNER_PROD], y01[MAX_LEN_INNER_PROD], y02[MAX_LEN_INNER_PROD];
+    opus_val32 xy1_org, xy1_opt, xy2_org, xy2_opt;
+    int N;
+
+    /* Literal name instead of the C99-only __func__; the output is unchanged. */
+    printf("%44s() ...", "test_dual_inner_prod");
+    init_val16_buffer(x, MAX_LEN_INNER_PROD);
+    init_val16_buffer(y01, MAX_LEN_INNER_PROD);
+    init_val16_buffer(y02, MAX_LEN_INNER_PROD);
+    for (N = 0; N <= MAX_LEN_INNER_PROD; N++) {
+#ifdef FIXED_POINT
+        dual_inner_prod_c(x, y01, y02, N, &xy1_org, &xy2_org);
+#else
+        dual_inner_prod_float_simulation(x, y01, y02, N, &xy1_org, &xy2_org, arch);
+#endif
+        dual_inner_prod(x, y01, y02, N, &xy1_opt, &xy2_opt, arch);
+        if ((xy1_org != xy1_opt) || (xy2_org != xy2_opt)) {
+            /* Label only the mismatching output as failed. */
+            if (xy1_org != xy1_opt) {
+#ifdef FIXED_POINT
+                printf("\nN=%d xy1_org = %d, xy1_opt = %d failed!", N, xy1_org, xy1_opt);
+#else
+                printf("\nN=%d xy1_org = %f, xy1_opt = %f failed!", N, xy1_org, xy1_opt);
+#endif
+            }
+            if (xy2_org != xy2_opt) {
+#ifdef FIXED_POINT
+                printf("\nN=%d xy2_org = %d, xy2_opt = %d failed!", N, xy2_org, xy2_opt);
+#else
+                printf("\nN=%d xy2_org = %f, xy2_opt = %f failed!", N, xy2_org, xy2_opt);
+#endif
+            }
+            /* End the report so later output starts on its own line. */
+            printf("\n");
+            return -1;
+        }
+    }
+    printf(" passed!\n");
+    return 0;
+}
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 785b40d..0b73839 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -67,6 +67,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
+# include "arm/pitch_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "kiss_fft.c"
# include "mdct.c"
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index e5f87ab..5e85599 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -91,7 +91,7 @@ opus_val32 celt_inner_prod_sse2(
int N);
#endif
-#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
opus_val32 celt_inner_prod_sse(
const opus_val16 *x,
const opus_val16 *y,
@@ -104,7 +104,7 @@ opus_val32 celt_inner_prod_sse(
#define celt_inner_prod(x, y, N, arch) \
((void)arch, celt_inner_prod_sse4_1(x, y, N))
-#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT)
#define OVERRIDE_CELT_INNER_PROD
#define celt_inner_prod(x, y, N, arch) \
((void)arch, celt_inner_prod_sse2(x, y, N))
@@ -114,7 +114,6 @@ opus_val32 celt_inner_prod_sse(
#define celt_inner_prod(x, y, N, arch) \
((void)arch, celt_inner_prod_sse(x, y, N))
-
#elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
(defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
diff --git a/celt_sources.mk b/celt_sources.mk
index c4bd285..dc107d9 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -38,7 +38,8 @@ celt/arm/armopts.s.in
CELT_SOURCES_ARM_NEON_INTR = \
celt/arm/celt_lpc_neon_intr.c \
-celt/arm/celt_neon_intr.c
+celt/arm/celt_neon_intr.c \
+celt/arm/pitch_neon_intr.c
CELT_SOURCES_ARM_NE10= \
celt/arm/celt_ne10_fft.c \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index 6155dfb..a88ac21 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -71,6 +71,7 @@
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "celt/arm/celt_lpc_neon_intr.c"
# include "celt/arm/celt_neon_intr.c"
+# include "celt/arm/pitch_neon_intr.c"
# include "silk/arm/biquad_alt_neon_intr.c"
# include "silk/arm/inner_prod_aligned_neon_intr.c"
# include "silk/arm/LPC_analysis_filter_neon_intr.c"
@@ -94,6 +95,7 @@
#endif
+# include "celt/tests/test_unit_optimization_pitch.c"
# include "silk/tests/test_unit_optimization_biquad_alt.c"
# include "silk/tests/test_unit_optimization_inner_prod_aligned.c"
# include "silk/tests/test_unit_optimization_LPC_analysis_filter.c"
@@ -118,6 +120,8 @@ int main(void)
result |= test_silk_LPC_inverse_pred_gain_Q24(arch);
result |= test_warped_autocorrelation(arch);
#endif /* FIXED_POINT */
+ result |= test_celt_inner_prod(arch);
+ result |= test_dual_inner_prod(arch);
result |= test_silk_biquad_alt(arch);
result |= test_silk_inner_prod_aligned_scale(arch);
result |= test_silk_LPC_analysis_filter(arch);
--
2.8.0.rc3.226.g39d4020
More information about the opus
mailing list