[opus] [PATCH 14/15] Optimize celt_inner_prod() and dual_inner_prod() for ARM NEON

Tue Sep 13 00:03:56 UTC 2016

Created corresponding unit test.
The fixed-point optimizations are bit exact with C functions.
The floating-point optimizations are not bit exact with the C functions,
because they change the order of the floating-point operations. But they are
bit exact with the simulation C functions, which simulate the floating-point
operations in the optimizations.
---
 celt/arm/arm_celt_map.c                   |  17 ++
 celt/arm/pitch_arm.h                      |  36 ++++
 celt/arm/pitch_neon_intr.c                | 179 ++++++++++++++++++++
 celt/pitch.h                              |   3 +-
 celt/tests/test_unit_dft.c                |   1 +
 celt/tests/test_unit_mathops.c            |   1 +
 celt/tests/test_unit_mdct.c               |   1 +
 celt/tests/test_unit_optimization_pitch.c | 263 ++++++++++++++++++++++++++++++
 celt/tests/test_unit_rotation.c           |   1 +
 celt/x86/pitch_sse.h                      |   5 +-
 celt_sources.mk                           |   3 +-
 tests/test_unit_optimization.c            |   4 +
 12 files changed, 508 insertions(+), 6 deletions(-)
 create mode 100644 celt/arm/pitch_neon_intr.c
 create mode 100644 celt/tests/test_unit_optimization_pitch.c

diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 6e28c70..a1a553a 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -36,6 +36,23 @@
 
 #if defined(OPUS_HAVE_RTCD)
 
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR) /* NEON is optional: tables are indexed by (arch & OPUS_ARCHMASK) */
+opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+  celt_inner_prod_c,             /* ARMv4 */
+  celt_inner_prod_c,             /* EDSP */
+  celt_inner_prod_c,             /* Media */
+  MAY_HAVE_NEON(celt_inner_prod) /* NEON */
+};
+/* Same per-architecture layout as CELT_INNER_PROD_IMPL, for dual_inner_prod(). */
+void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+      int N, opus_val32 *xy1, opus_val32 *xy2) = {
+  dual_inner_prod_c,             /* ARMv4 */
+  dual_inner_prod_c,             /* EDSP */
+  dual_inner_prod_c,             /* Media */
+  MAY_HAVE_NEON(dual_inner_prod) /* NEON */
+};
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR && !OPUS_ARM_PRESUME_NEON_INTR */
+
 # if defined(FIXED_POINT)
 #  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
 void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index d8b022e..d1a8db0 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -30,6 +30,42 @@
 
 # include "armcpu.h"
 
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) /* NEON intrinsics versions, implemented in pitch_neon_intr.c */
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N);
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01,
+        const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+# if !defined(OPUS_HAVE_RTCD) /* no run-time CPU detection: bind the implementation at compile time; arch is evaluated and discarded */
+#  define OVERRIDE_CELT_INNER_PROD (1)
+#  define OVERRIDE_DUAL_INNER_PROD (1) /* NOTE(review): PRESUME_NEON() below is defined outside this hunk (presumably armcpu.h) - confirm what it selects when NEON is not presumed */
+#  define celt_inner_prod(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod)(x, y, N))
+#  define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod)(x, y01, y02, N, xy1, xy2))
+# endif /* !OPUS_HAVE_RTCD */
+
+# if !defined(OVERRIDE_CELT_INNER_PROD) /* skipped when the non-RTCD branch already bound celt_inner_prod() */
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) /* NEON optional: call through the per-arch table */
+extern opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N);
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((*CELT_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y, N))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR) /* NEON guaranteed at build time: call the NEON version directly */
+#   define OVERRIDE_CELT_INNER_PROD (1)
+#   define celt_inner_prod(x, y, N, arch) ((void)(arch), celt_inner_prod_neon(x, y, N))
+#  endif
+# endif /* !OVERRIDE_CELT_INNER_PROD */
+
+# if !defined(OVERRIDE_DUAL_INNER_PROD) /* skipped when the non-RTCD branch already bound dual_inner_prod() */
+#  if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)) /* NEON optional: call through the per-arch table */
+extern void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x,
+        const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2);
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_IMPL[(arch)&OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+#  elif defined(OPUS_ARM_PRESUME_NEON_INTR) /* NEON guaranteed at build time: call the NEON version directly */
+#   define OVERRIDE_DUAL_INNER_PROD (1)
+#   define dual_inner_prod(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_neon(x, y01, y02, N, xy1, xy2))
+#  endif
+# endif /* !OVERRIDE_DUAL_INNER_PROD */
+
 # if defined(FIXED_POINT)
 
 #  if defined(OPUS_ARM_MAY_HAVE_NEON)
diff --git a/celt/arm/pitch_neon_intr.c b/celt/arm/pitch_neon_intr.c
new file mode 100644
index 0000000..2bda6e1
--- /dev/null
+++ b/celt/arm/pitch_neon_intr.c
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+   @file pitch_neon_intr.c
+   @brief ARM Neon Intrinsic optimizations for celt pitch functions
+ */
+
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "pitch.h"
+/* NEON inner product: returns the sum of x[i]*y[i] for i in [0, N). */
+opus_val32 celt_inner_prod_neon(const opus_val16 *x, const opus_val16 *y, int N)
+{
+    int i;
+    opus_val32 xy;
+
+#ifdef FIXED_POINT
+    int16x8_t x_s16x8, y_s16x8;
+    int32x4_t xy_s32x4 = vdupq_n_s32(0); /* four 32-bit partial sums */
+    int64x2_t xy_s64x2;
+    int64x1_t xy_s64x1;
+    /* Main loop: 8 samples per pass, as two 4-lane widening multiply-accumulates. */
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8  = vld1q_s16(&x[i]);
+        y_s16x8  = vld1q_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y_s16x8));
+        xy_s32x4 = vmlal_s16(xy_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y_s16x8));
+    }
+    /* One extra group of 4 when at least 4 samples remain. */
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4 = vld1_s16(&x[i]);
+        const int16x4_t y_s16x4 = vld1_s16(&y[i]);
+        xy_s32x4 = vmlal_s16(xy_s32x4, x_s16x4, y_s16x4);
+        i += 4;
+    }
+    /* Reduce the four lanes: pairwise add into 64 bits, add the two halves, keep 32-bit lane 0. */
+    xy_s64x2 = vpaddlq_s32(xy_s32x4);
+    xy_s64x1 = vadd_s64(vget_low_s64(xy_s64x2), vget_high_s64(xy_s64x2));
+    xy       = vget_lane_s32(vreinterpret_s32_s64(xy_s64x1), 0); /* NOTE(review): assumed to be the low 32 bits of the 64-bit sum - confirm for big-endian targets */
+#else
+    float32x4_t xy_f32x4 = vdupq_n_f32(0); /* four float partial sums */
+    float32x2_t xy_f32x2;
+    /* Main loop: 8 samples per pass, both groups of 4 accumulated into the same four lanes. */
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y_f32x4;
+        x_f32x4  = vld1q_f32(&x[i]);
+        y_f32x4  = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        x_f32x4  = vld1q_f32(&x[i + 4]);
+        y_f32x4  = vld1q_f32(&y[i + 4]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+    }
+    /* One extra group of 4 when at least 4 samples remain. */
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4 = vld1q_f32(&x[i]);
+        const float32x4_t y_f32x4 = vld1q_f32(&y[i]);
+        xy_f32x4 = vmlaq_f32(xy_f32x4, x_f32x4, y_f32x4);
+        i += 4;
+    }
+    /* Reduce as (lane0 + lane2) + (lane1 + lane3); this order is what the unit test's scalar model reproduces. */
+    xy_f32x2 = vadd_f32(vget_low_f32(xy_f32x4), vget_high_f32(xy_f32x4));
+    xy_f32x2 = vpadd_f32(xy_f32x2, xy_f32x2);
+    xy       = vget_lane_f32(xy_f32x2, 0);
+#endif
+    /* Scalar tail for the samples left over (fewer than 4). */
+    for (; i < N; i++) {
+        xy = MAC16_16(xy, x[i], y[i]);
+    }
+    return xy;
+}
+/* NEON dual inner product: *xy1 = sum of x[i]*y01[i], *xy2 = sum of x[i]*y02[i], for i in [0, N). */
+void dual_inner_prod_neon(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+    int i;
+    opus_val32 xy01, xy02;
+
+#ifdef FIXED_POINT
+    int16x8_t x_s16x8, y01_s16x8, y02_s16x8;
+    int32x4_t xy01_s32x4 = vdupq_n_s32(0); /* four 32-bit partial sums per output */
+    int32x4_t xy02_s32x4 = vdupq_n_s32(0);
+    int64x2_t xy01_s64x2, xy02_s64x2;
+    int64x1_t xy01_s64x1, xy02_s64x1;
+    /* Main loop: 8 samples per pass; each load of x is shared by both products. */
+    for (i = 0; i < N - 7; i += 8) {
+        x_s16x8    = vld1q_s16(&x[i]);
+        y01_s16x8  = vld1q_s16(&y01[i]);
+        y02_s16x8  = vld1q_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_low_s16 (x_s16x8), vget_low_s16 (y02_s16x8));
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y01_s16x8));
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, vget_high_s16(x_s16x8), vget_high_s16(y02_s16x8));
+    }
+    /* One extra group of 4 when at least 4 samples remain. */
+    if (N - i >= 4) {
+        const int16x4_t x_s16x4   = vld1_s16(&x[i]);
+        const int16x4_t y01_s16x4 = vld1_s16(&y01[i]);
+        const int16x4_t y02_s16x4 = vld1_s16(&y02[i]);
+        xy01_s32x4 = vmlal_s16(xy01_s32x4, x_s16x4, y01_s16x4);
+        xy02_s32x4 = vmlal_s16(xy02_s32x4, x_s16x4, y02_s16x4);
+        i += 4;
+    }
+    /* Reduce each accumulator: pairwise add into 64 bits, add the two halves, keep 32-bit lane 0. */
+    xy01_s64x2 = vpaddlq_s32(xy01_s32x4);
+    xy02_s64x2 = vpaddlq_s32(xy02_s32x4);
+    xy01_s64x1 = vadd_s64(vget_low_s64(xy01_s64x2), vget_high_s64(xy01_s64x2));
+    xy02_s64x1 = vadd_s64(vget_low_s64(xy02_s64x2), vget_high_s64(xy02_s64x2));
+    xy01       = vget_lane_s32(vreinterpret_s32_s64(xy01_s64x1), 0); /* NOTE(review): assumed to be the low 32 bits of the 64-bit sum - confirm for big-endian targets */
+    xy02       = vget_lane_s32(vreinterpret_s32_s64(xy02_s64x1), 0);
+#else
+    float32x4_t xy01_f32x4 = vdupq_n_f32(0); /* four float partial sums per output */
+    float32x4_t xy02_f32x4 = vdupq_n_f32(0);
+    float32x2_t xy01_f32x2, xy02_f32x2;
+    /* Main loop: 8 samples per pass, both groups of 4 accumulated into the same four lanes. */
+    for (i = 0; i < N - 7; i += 8) {
+        float32x4_t x_f32x4, y01_f32x4, y02_f32x4;
+        x_f32x4    = vld1q_f32(&x[i]);
+        y01_f32x4  = vld1q_f32(&y01[i]);
+        y02_f32x4  = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        x_f32x4    = vld1q_f32(&x[i + 4]);
+        y01_f32x4  = vld1q_f32(&y01[i + 4]);
+        y02_f32x4  = vld1q_f32(&y02[i + 4]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+    }
+    /* One extra group of 4 when at least 4 samples remain. */
+    if (N - i >= 4) {
+        const float32x4_t x_f32x4   = vld1q_f32(&x[i]);
+        const float32x4_t y01_f32x4 = vld1q_f32(&y01[i]);
+        const float32x4_t y02_f32x4 = vld1q_f32(&y02[i]);
+        xy01_f32x4 = vmlaq_f32(xy01_f32x4, x_f32x4, y01_f32x4);
+        xy02_f32x4 = vmlaq_f32(xy02_f32x4, x_f32x4, y02_f32x4);
+        i += 4;
+    }
+    /* Reduce each as (lane0 + lane2) + (lane1 + lane3); this order is what the unit test's scalar model reproduces. */
+    xy01_f32x2 = vadd_f32(vget_low_f32(xy01_f32x4), vget_high_f32(xy01_f32x4));
+    xy02_f32x2 = vadd_f32(vget_low_f32(xy02_f32x4), vget_high_f32(xy02_f32x4));
+    xy01_f32x2 = vpadd_f32(xy01_f32x2, xy01_f32x2);
+    xy02_f32x2 = vpadd_f32(xy02_f32x2, xy02_f32x2);
+    xy01       = vget_lane_f32(xy01_f32x2, 0);
+    xy02       = vget_lane_f32(xy02_f32x2, 0);
+#endif
+    /* Scalar tail for the samples left over (fewer than 4), then store both results. */
+    for (; i < N; i++) {
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
diff --git a/celt/pitch.h b/celt/pitch.h
index d797844..e425f56 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -46,8 +46,7 @@
 #include "mips/pitch_mipsr1.h"
 #endif
 
-#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
-  || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#if (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
 # include "arm/pitch_arm.h"
 #endif
 
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 582618e..02904bf 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -54,6 +54,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
+#  include "arm/pitch_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "mdct.c"
 #   include "arm/celt_ne10_fft.c"
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index f5af994..524c1f8 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -69,6 +69,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
+#  include "arm/pitch_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
 #   include "mdct.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 0658c7a..3b28767 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -55,6 +55,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
+#  include "arm/pitch_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "arm/celt_ne10_fft.c"
 #   include "arm/celt_ne10_mdct.c"
diff --git a/celt/tests/test_unit_optimization_pitch.c b/celt/tests/test_unit_optimization_pitch.c
new file mode 100644
index 0000000..64bb2a9
--- /dev/null
+++ b/celt/tests/test_unit_optimization_pitch.c
@@ -0,0 +1,263 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+
+#include "modes.h"
+#include "pitch.h"
+
+#define MAX_LEN_INNER_PROD 960
+
+#ifndef UNIT_TEST_CELT_INNER_PROD
+#define UNIT_TEST_CELT_INNER_PROD
+/* Returns a pseudo-random float in about [min, max], scaled from rand(); OPUS_INLINE (not bare inline) matches the sibling helpers. */
+static OPUS_INLINE float rand_float(float min, float max)
+{
+    return ((max - min) * ((float)rand() / RAND_MAX)) + min;
+}
+/* Random test sample: a float from rand_float(min, max) in float builds, a raw rand() value in fixed-point builds. */
+static OPUS_INLINE opus_val16 rand_val16(opus_val16 min, opus_val16 max)
+{
+#ifndef FIXED_POINT
+    return rand_float(min, max);
+#else
+    (void)max;      /* the requested range only applies to the float build */
+    (void)min;
+    return rand();  /* implicitly converted to opus_val16 */
+#endif
+}
+
+static OPUS_INLINE void init_val16_buffer(opus_val16* buffer, int num)
+{
+    const opus_val16 min = (opus_val16)-1e10;
+    const opus_val16 max = (opus_val16) 1e10;
+
+    for (int i = 0; i < num; i++) {
+        buffer[i] = rand_val16(min, max);
+    }
+}
+
+#endif
+
+/* ========================================================================== */
+/* This part of code simulates floating-point operations. */
+
+#ifndef FIXED_POINT
+
+/* celt_inner_prod_float_simulation_sse() is a scalar model of the floating-point
+ * operations of celt_inner_prod_sse(); both functions should have bit exact output.
+ * It keeps four running sums (xy0..xy3) and combines them as (xy0+xy2)+(xy1+xy3). */
+opus_val32 celt_inner_prod_float_simulation_sse(const opus_val16 *x,
+      const opus_val16 *y, int N)
+{
+   int i;
+   opus_val32 xy, xy0 = 0, xy1 = 0, xy2 = 0, xy3 = 0;
+   for (i = 0; i < N - 3; i += 4) { /* sample i+k always goes to running sum k */
+      xy0 = MAC16_16(xy0, x[i + 0], y[i + 0]);
+      xy1 = MAC16_16(xy1, x[i + 1], y[i + 1]);
+      xy2 = MAC16_16(xy2, x[i + 2], y[i + 2]);
+      xy3 = MAC16_16(xy3, x[i + 3], y[i + 3]);
+   }
+   xy0 += xy2; /* the order of these three additions is part of the model */
+   xy1 += xy3;
+   xy = xy0 + xy1;
+   for (; i < N; i++) { /* up to 3 leftover samples, added one by one */
+      xy = MAC16_16(xy, x[i], y[i]);
+   }
+   return xy;
+}
+
+/* dual_inner_prod_float_simulation_sse() is a scalar model of the floating-point
+ * operations of dual_inner_prod_sse(); both functions should have bit exact output.
+ * Each output keeps four running sums, combined as (s0+s2)+(s1+s3). */
+void dual_inner_prod_float_simulation_sse(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02,
+        int N, opus_val32 *xy1, opus_val32 *xy2)
+{
+   int i;
+   opus_val32 xy01, xy02, xy01_0 = 0, xy01_1 = 0, xy01_2 = 0, xy01_3 = 0, xy02_0 = 0, xy02_1 = 0, xy02_2 = 0, xy02_3 = 0;
+   for (i = 0; i < N - 3; i += 4) { /* sample i+k always goes to running sum k of each output */
+      xy01_0 = MAC16_16(xy01_0, x[i + 0], y01[i + 0]);
+      xy01_1 = MAC16_16(xy01_1, x[i + 1], y01[i + 1]);
+      xy01_2 = MAC16_16(xy01_2, x[i + 2], y01[i + 2]);
+      xy01_3 = MAC16_16(xy01_3, x[i + 3], y01[i + 3]);
+      xy02_0 = MAC16_16(xy02_0, x[i + 0], y02[i + 0]);
+      xy02_1 = MAC16_16(xy02_1, x[i + 1], y02[i + 1]);
+      xy02_2 = MAC16_16(xy02_2, x[i + 2], y02[i + 2]);
+      xy02_3 = MAC16_16(xy02_3, x[i + 3], y02[i + 3]);
+   }
+   xy01_0 += xy01_2; /* the order of these additions is part of the model */
+   xy02_0 += xy02_2;
+   xy01_1 += xy01_3;
+   xy02_1 += xy02_3;
+   xy01 = xy01_0 + xy01_1;
+   xy02 = xy02_0 + xy02_1;
+   for (; i < N; i++) { /* up to 3 leftover samples, added one by one */
+      xy01 = MAC16_16(xy01, x[i], y01[i]);
+      xy02 = MAC16_16(xy02, x[i], y02[i]);
+   }
+   *xy1 = xy01;
+   *xy2 = xy02;
+}
+/* The plain C functions need no separate model: they serve as their own reference. */
+# define celt_inner_prod_float_simulation_c celt_inner_prod_c
+# define dual_inner_prod_float_simulation_c dual_inner_prod_c
+
+/* The NEON code combines its four lanes in the same order as the SSE model, so the SSE model is reused for NEON. */
+# define celt_inner_prod_float_simulation_neon celt_inner_prod_float_simulation_sse
+# define dual_inner_prod_float_simulation_neon dual_inner_prod_float_simulation_sse
+
+# ifdef OPUS_X86_MAY_HAVE_SSE /* x86: select the model that matches the variant celt_inner_prod() resolves to */
+#  define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1)
+#  define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1)
+#  ifdef OPUS_X86_PRESUME_SSE /* SSE guaranteed at build time: always use the SSE model */
+#   define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), celt_inner_prod_float_simulation_sse(x, y, N))
+#   define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_float_simulation_sse(x, y01, y02, N, xy1, xy2))
+#  else /* SSE optional: select the model by arch at run time */
+#   define celt_inner_prod_float_simulation(x, y, N, arch) ((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+#   define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+        celt_inner_prod_float_simulation_c,                /* non-sse */
+        MAY_HAVE_SSE(celt_inner_prod_float_simulation),    /* NOTE(review): remaining entries presumably follow the x86 arch numbering (SSE and above) - confirm */
+        MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+        MAY_HAVE_SSE(celt_inner_prod_float_simulation),
+        MAY_HAVE_SSE(celt_inner_prod_float_simulation)
+};
+void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) = {
+        dual_inner_prod_float_simulation_c,                /* non-sse */
+        MAY_HAVE_SSE(dual_inner_prod_float_simulation),    /* same layout as the table above */
+        MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+        MAY_HAVE_SSE(dual_inner_prod_float_simulation),
+        MAY_HAVE_SSE(dual_inner_prod_float_simulation)
+};
+#  endif /* !defined(OPUS_X86_PRESUME_SSE) */
+# endif /* OPUS_X86_MAY_HAVE_SSE */
+
+# ifdef OPUS_ARM_MAY_HAVE_NEON_INTR /* ARM: select the model that matches the variant celt_inner_prod() resolves to */
+#  define OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION (1)
+#  define OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION (1)
+#  ifndef OPUS_HAVE_RTCD /* no run-time detection: PRESUME_NEON() picks the model at build time */
+#   define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), PRESUME_NEON(celt_inner_prod_float_simulation)(x, y, N))
+#   define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), PRESUME_NEON(dual_inner_prod_float_simulation)(x, y01, y02, N, xy1, xy2))
+#  else
+#   ifdef OPUS_ARM_PRESUME_NEON_INTR /* NEON guaranteed at build time: always use the NEON model */
+#    define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch), celt_inner_prod_float_simulation_neon(x, y, N))
+#    define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch), dual_inner_prod_float_simulation_neon(x, y01, y02, N, xy1, xy2))
+#   else /* NEON optional: select the model by arch at run time */
+#    define celt_inner_prod_float_simulation(x, y, N, arch) ((*CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y, N))
+#    define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((*DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[(arch) & OPUS_ARCHMASK])(x, y01, y02, N, xy1, xy2))
+opus_val32 (*const CELT_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y, int N) = {
+        celt_inner_prod_float_simulation_c,             /* ARMv4 */
+        celt_inner_prod_float_simulation_c,             /* EDSP */
+        celt_inner_prod_float_simulation_c,             /* Media */
+        MAY_HAVE_NEON(celt_inner_prod_float_simulation) /* NEON */
+};
+void (*const DUAL_INNER_PROD_FLOAT_SIMULATION_IMPL[OPUS_ARCHMASK + 1])(const opus_val16 *x, const opus_val16 *y01, const opus_val16 *y02, int N, opus_val32 *xy1, opus_val32 *xy2) = {
+        dual_inner_prod_float_simulation_c,             /* ARMv4 */
+        dual_inner_prod_float_simulation_c,             /* EDSP */
+        dual_inner_prod_float_simulation_c,             /* Media */
+        MAY_HAVE_NEON(dual_inner_prod_float_simulation) /* NEON */
+};
+#   endif /* !defined(OPUS_ARM_PRESUME_NEON_INTR) */
+#  endif /* OPUS_HAVE_RTCD */
+# endif /* OPUS_ARM_MAY_HAVE_NEON_INTR */
+
+# ifndef OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION /* no SIMD model selected above: the C function is the reference */
+#  define celt_inner_prod_float_simulation(x, y, N, arch) ((void)(arch),celt_inner_prod_float_simulation_c(x, y, N))
+# endif /* !OVERRIDE_CELT_INNER_PROD_FLOAT_SIMULATION */
+
+# ifndef OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION /* no SIMD model selected above: the C function is the reference */
+#  define dual_inner_prod_float_simulation(x, y01, y02, N, xy1, xy2, arch) ((void)(arch),dual_inner_prod_float_simulation_c(x, y01, y02, N, xy1, xy2))
+# endif /* !OVERRIDE_DUAL_INNER_PROD_FLOAT_SIMULATION */
+
+#endif /* !FIXED_POINT */
+
+/* ========================================================================== */
+/* Compares celt_inner_prod() with its reference for every N in [0, MAX_LEN_INNER_PROD]; returns 0 if all match, -1 on the first mismatch. */
+static int test_celt_inner_prod(int arch)
+{
+    opus_val16 x[MAX_LEN_INNER_PROD], y[MAX_LEN_INNER_PROD];
+    opus_val32 xy_org, xy_opt;
+    int N;
+
+    printf("%44s() ...", "test_celt_inner_prod"); /* literal name: __func__ is C99 and not available to C89 compilers */
+    init_val16_buffer(x, MAX_LEN_INNER_PROD);
+    init_val16_buffer(y, MAX_LEN_INNER_PROD);
+    for (N = 0; N <= MAX_LEN_INNER_PROD; N++) {
+#ifdef FIXED_POINT
+        xy_org = celt_inner_prod_c(x, y, N); /* fixed-point: the C function itself is the reference */
+#else
+        xy_org = celt_inner_prod_float_simulation(x, y, N, arch); /* float: scalar model with the optimized summation order */
+#endif
+        xy_opt = celt_inner_prod(x, y, N, arch);
+        if (xy_org != xy_opt) { /* exact comparison on purpose: the results must be bit exact */
+#ifdef FIXED_POINT
+            printf("\nN=%d xy_org = %d, xy_opt = %d failed!", N, xy_org, xy_opt);
+#else
+            printf("\nN=%d xy_org = %f, xy_opt = %f failed!", N, xy_org, xy_opt);
+#endif
+            return -1;
+        }
+    }
+    printf(" passed!\n");
+    return 0;
+}
+/* Compares dual_inner_prod() with its reference for every N in [0, MAX_LEN_INNER_PROD]; returns 0 if all match, -1 on the first mismatch. */
+static int test_dual_inner_prod(int arch)
+{
+    opus_val16 x[MAX_LEN_INNER_PROD], y01[MAX_LEN_INNER_PROD], y02[MAX_LEN_INNER_PROD];
+    opus_val32 xy1_org, xy1_opt, xy2_org, xy2_opt;
+    int N;
+
+    printf("%44s() ...", "test_dual_inner_prod"); /* literal name: __func__ is C99 and not available to C89 compilers */
+    init_val16_buffer(x,   MAX_LEN_INNER_PROD);
+    init_val16_buffer(y01, MAX_LEN_INNER_PROD);
+    init_val16_buffer(y02, MAX_LEN_INNER_PROD);
+    for (N = 0; N <= MAX_LEN_INNER_PROD; N++) {
+#ifdef FIXED_POINT
+        dual_inner_prod_c(x, y01, y02, N, &xy1_org, &xy2_org); /* fixed-point: the C function itself is the reference */
+#else
+        dual_inner_prod_float_simulation(x, y01, y02, N, &xy1_org, &xy2_org, arch); /* float: scalar model with the optimized summation order */
+#endif
+        dual_inner_prod(x, y01, y02, N, &xy1_opt, &xy2_opt, arch);
+        if ((xy1_org != xy1_opt) || (xy2_org != xy2_opt)) { /* exact comparison on purpose: the results must be bit exact */
+#ifdef FIXED_POINT
+            printf("\nN=%d xy1_org = %d, xy1_opt = %d failed!", N, xy1_org, xy1_opt);
+            printf("\nN=%d xy2_org = %d, xy2_opt = %d failed!", N, xy2_org, xy2_opt);
+#else
+            printf("\nN=%d xy1_org = %f, xy1_opt = %f failed!", N, xy1_org, xy1_opt);
+            printf("\nN=%d xy2_org = %f, xy2_opt = %f failed!", N, xy2_org, xy2_opt);
+#endif
+            return -1;
+        }
+    }
+    printf(" passed!\n");
+    return 0;
+}
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 785b40d..0b73839 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -67,6 +67,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "arm/celt_lpc_neon_intr.c"
 #  include "arm/celt_neon_intr.c"
+#  include "arm/pitch_neon_intr.c"
 #  if defined(HAVE_ARM_NE10)
 #   include "kiss_fft.c"
 #   include "mdct.c"
diff --git a/celt/x86/pitch_sse.h b/celt/x86/pitch_sse.h
index e5f87ab..5e85599 100644
--- a/celt/x86/pitch_sse.h
+++ b/celt/x86/pitch_sse.h
@@ -91,7 +91,7 @@ opus_val32 celt_inner_prod_sse2(
     int               N);
 #endif
 
-#if defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(FIXED_POINT)
+#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT)
 opus_val32 celt_inner_prod_sse(
     const opus_val16 *x,
     const opus_val16 *y,
@@ -104,7 +104,7 @@ opus_val32 celt_inner_prod_sse(
 #define celt_inner_prod(x, y, N, arch) \
     ((void)arch, celt_inner_prod_sse4_1(x, y, N))
 
-#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)
+#elif defined(OPUS_X86_PRESUME_SSE2) && defined(FIXED_POINT)
 #define OVERRIDE_CELT_INNER_PROD
 #define celt_inner_prod(x, y, N, arch) \
     ((void)arch, celt_inner_prod_sse2(x, y, N))
@@ -114,7 +114,6 @@ opus_val32 celt_inner_prod_sse(
 #define celt_inner_prod(x, y, N, arch) \
     ((void)arch, celt_inner_prod_sse(x, y, N))
 
-
 #elif ((defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) && defined(FIXED_POINT)) || \
     (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(FIXED_POINT))
 
diff --git a/celt_sources.mk b/celt_sources.mk
index c4bd285..dc107d9 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -38,7 +38,8 @@ celt/arm/armopts.s.in
 
 CELT_SOURCES_ARM_NEON_INTR = \
 celt/arm/celt_lpc_neon_intr.c \
-celt/arm/celt_neon_intr.c
+celt/arm/celt_neon_intr.c \
+celt/arm/pitch_neon_intr.c
 
 CELT_SOURCES_ARM_NE10= \
 celt/arm/celt_ne10_fft.c \
diff --git a/tests/test_unit_optimization.c b/tests/test_unit_optimization.c
index 6155dfb..a88ac21 100644
--- a/tests/test_unit_optimization.c
+++ b/tests/test_unit_optimization.c
@@ -71,6 +71,7 @@
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #  include "celt/arm/celt_lpc_neon_intr.c"
 #  include "celt/arm/celt_neon_intr.c"
+#  include "celt/arm/pitch_neon_intr.c"
 #  include "silk/arm/biquad_alt_neon_intr.c"
 #  include "silk/arm/inner_prod_aligned_neon_intr.c"
 #  include "silk/arm/LPC_analysis_filter_neon_intr.c"
@@ -94,6 +95,7 @@
 
 #endif
 
+# include "celt/tests/test_unit_optimization_pitch.c"
 # include "silk/tests/test_unit_optimization_biquad_alt.c"
 # include "silk/tests/test_unit_optimization_inner_prod_aligned.c"
 # include "silk/tests/test_unit_optimization_LPC_analysis_filter.c"
@@ -118,6 +120,8 @@ int main(void)
       result |= test_silk_LPC_inverse_pred_gain_Q24(arch);
       result |= test_warped_autocorrelation(arch);
 #endif /* FIXED_POINT */
+      result |= test_celt_inner_prod(arch);
+      result |= test_dual_inner_prod(arch);
       result |= test_silk_biquad_alt(arch);
       result |= test_silk_inner_prod_aligned_scale(arch);
       result |= test_silk_LPC_analysis_filter(arch);
-- 
2.8.0.rc3.226.g39d4020