[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Timothy B. Terriberry
tterribe at xiph.org
Fri Nov 28 13:52:12 PST 2014
Review comments inline.
> +if OPUS_ARM_NEON_INTR
> +noinst_LTLIBRARIES = libarmneon.la
> +libarmneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON_INTR)
> +libarmneon_la_CPPFLAGS = $(OPUS_ARM_NEON_INTR_CPPFLAGS) -I$(top_srcdir)/include
> +endif
I don't think these should be in a separate library. A separate library
brings a lot of complications with it (to name one: wouldn't the .pc files
need to be updated?). Please use the same mechanism that the SSE intrinsics
use to add CFLAGS to the compilation of specific object files, e.g.,
$(SSE_OBJ): CFLAGS += -msse4.1
> +void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
> + const opus_val16 *, opus_val32 *, int , int, int) = {
> + celt_pitch_xcorr_c, /* ARMv4 */
> + celt_pitch_xcorr_c, /* EDSP */
> + celt_pitch_xcorr_c, /* Media */
> +#if defined(OPUS_ARM_NEON_INTR)
> + celt_pitch_xcorr_float_neon /* Neon */
Please do not use tabs in source code (this applies here and everywhere
below). Even with the tabs expanded in context, the comments here do not
line up properly.
> +static void xcorr_kernel_neon_float(float *x, float *y, float sum[4], int len) {
x and y should be const.
> + float32x4_t YY[5];
> + float32x4_t XX[4];
> + float32x2_t XX_2;
> + float32x4_t SUMM[4];
> + float *xi = x;
> + float *yi = y;
> + int cd = len/4;
> + int cr = len%4;
len is signed, so / and % are NOT equivalent to the corresponding >> and
& (and the signed operations are much slower).
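For illustration only (this leans on the patch's own celt_assert(len>=3), so
len is known to be non-negative), the counts could be written as

  int cd = len>>2;  /* same result as len/4 for non-negative len, one shift */
  int cr = len&3;   /* same result as len%4 for non-negative len, one mask  */

or len could simply be treated as unsigned for these two operations.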
> + int j;
> +
> + celt_assert(len>=3);
> +
> + /* Initialize sums to 0 */
> + SUMM[0] = vdupq_n_f32(0);
> + SUMM[1] = vdupq_n_f32(0);
> + SUMM[2] = vdupq_n_f32(0);
> + SUMM[3] = vdupq_n_f32(0);
> +
> + YY[0] = vld1q_f32(yi);
> +
> + /* Each loop consumes 8 floats in y vector
> + * and 4 floats in x vector
> + */
> + for (j = 0; j < cd; j++) {
> + yi += 4;
> + YY[4] = vld1q_f32(yi);
If len == 4, then in the first iteration you will have loaded 8 y
values, but only 7 are guaranteed to be available (the C code only
references y[0] up to y[len-1+3], i.e., y[6] here). You need to end this loop early and
fall back to another approach. See comments in celt_pitch_xcorr_arm.s
for details and an example (there are other useful comments there that
could shave another cycle or two from this inner loop).
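One possible shape for that (just a sketch; the scheme described in
celt_pitch_xcorr_arm.s is more refined): run the 8-float loop only while the
next 4-float load of y is known to stay within y[0]..y[len+2], and let the
remainder loop below absorb the last one to four x values, e.g.

  /* Sketch: (len-1)>>2 iterations keep every YY[4] load inside
   * y[0]..y[len+2]; the remainder loop then handles up to 4 leftovers. */
  int cd = (len-1)>>2;
  int cr = len-(cd<<2);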
> + YY[1] = vextq_f32(YY[0], YY[4], 1);
> + YY[2] = vextq_f32(YY[0], YY[4], 2);
> + YY[3] = vextq_f32(YY[0], YY[4], 3);
> +
> + XX[0] = vld1q_dup_f32(xi++);
> + XX[1] = vld1q_dup_f32(xi++);
> + XX[2] = vld1q_dup_f32(xi++);
> + XX[3] = vld1q_dup_f32(xi++);
Don't do this. Do a single load and use vmlaq_lane_f32() to multiply by
each value. That should cut at least 5 cycles out of this loop.
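E.g., something along these lines (a sketch; XX0, XX_lo, and XX_hi are just
illustrative names, and this also subsumes the four vmlaq_f32() calls below):

  float32x4_t XX0 = vld1q_f32(xi);     /* one load of x[0]..x[3] */
  float32x2_t XX_lo = vget_low_f32(XX0);
  float32x2_t XX_hi = vget_high_f32(XX0);
  xi += 4;
  SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_lo, 0);
  SUMM[1] = vmlaq_lane_f32(SUMM[1], YY[1], XX_lo, 1);
  SUMM[2] = vmlaq_lane_f32(SUMM[2], YY[2], XX_hi, 0);
  SUMM[3] = vmlaq_lane_f32(SUMM[3], YY[3], XX_hi, 1);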
> +
> + SUMM[0] = vmlaq_f32(SUMM[0], XX[0], YY[0]);
> + SUMM[1] = vmlaq_f32(SUMM[1], XX[1], YY[1]);
> + SUMM[2] = vmlaq_f32(SUMM[2], XX[2], YY[2]);
> + SUMM[3] = vmlaq_f32(SUMM[3], XX[3], YY[3]);
> + YY[0] = YY[4];
> + }
> +
> + /* Handle remaining values max iterations = 3 */
> + for (j = 0; j < cr; j++) {
> + YY[0] = vld1q_f32(yi++);
This load is always redundant in the first iteration, which is a bit
unfortunate.
> + XX_2 = vld1_lane_f32(xi++, XX_2, 0);
Don't load a single lane when you don't need the value(s) in the other
lane(s). Use vld1_dup_f32() instead. It's faster and breaks dependencies.
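I.e., simply

  XX_2 = vld1_dup_f32(xi++);  /* broadcast *xi into both lanes */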
> + SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_2, 0);
> + }
> +
> + SUMM[0] = vaddq_f32(SUMM[0], SUMM[1]);
> + SUMM[2] = vaddq_f32(SUMM[2], SUMM[3]);
> + SUMM[0] = vaddq_f32(SUMM[0], SUMM[2]);
> +
> + vst1q_f32(sum, SUMM[0]);
> +}
> +
> +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
> + opus_val32 *xcorr, int len, int max_pitch, int arch) {
arch is unused. There's no reason to pass it here. If we're here, we
know what the arch is.
> + int i, j;
> +
> + celt_assert(max_pitch > 0);
> + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
> +
> + for (i = 0; i < (max_pitch-3); i += 4) {
> + xcorr_kernel_neon_float((float *)_x, (float *)_y+i,
> + (float *)xcorr+i, len);
> + }
> +
> + /* In case max_pitch isn't multiple of 4, do unrolled version */
> + for (; i < max_pitch; i++) {
> + float sum = 0;
> + float *yi = _y+i;
> + for (j = 0; j < len; j++)
> + sum += _x[i]*yi[i];
> + xcorr[i] = sum;
> + }
This loop can still be largely vectorized. Any reason not to do so?
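For example, something like the following (a rough sketch, not from the
patch; xj, yj, acc and accp are illustrative names, and it accumulates
x[j]*y[i+j] as the C reference kernel does):

  for (; i < max_pitch; i++) {
    const float *xj = (const float *)_x;
    const float *yj = (const float *)_y+i;
    float32x4_t acc = vdupq_n_f32(0);
    float32x2_t accp;
    float sum;
    for (j = 0; j <= len-4; j += 4)
      acc = vmlaq_f32(acc, vld1q_f32(xj+j), vld1q_f32(yj+j));
    /* Horizontal add of the four partial sums. */
    accp = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    accp = vpadd_f32(accp, accp);
    sum = vget_lane_f32(accp, 0);
    /* Scalar tail for the remaining len%4 samples. */
    for (; j < len; j++)
      sum += xj[j]*yj[j];
    xcorr[i] = sum;
  }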
> +}
> diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
> index a07f8ac..f5adc48 100644
> --- a/celt/arm/pitch_arm.h
> +++ b/celt/arm/pitch_arm.h
> @@ -52,6 +52,19 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
> ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
> # endif
>
> -# endif
> +#else /* Start !FIXED_POINT */
> +/* Float case */
> +#if defined(OPUS_ARM_NEON_INTR)
> +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
> + opus_val32 *xcorr, int len, int max_pitch, int arch);
> +#endif
> +
> +#if !defined(OPUS_HAVE_RTCD) && defined(OPUS_PRESUME_ARM_NEON_INTR)
> +#define OVERRIDE_PITCH_XCORR (1)
> +#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
> + ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, \
> + len, max_pitch, arch))
Again, don't pass arch.
> (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
> - const opus_val16 *, opus_val32 *, int, int);
> + const opus_val16 *, opus_val32 *, int, int
> +#if !defined(FIXED_POINT)
> + ,int
> +#endif
> +);
Which gets rid of this ugliness.
> +#if defined(FIXED_POINT)
> +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
> + ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
> + xcorr, len, max_pitch)
> +#else
> # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
> ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
> - xcorr, len, max_pitch))
> + xcorr, len, max_pitch, arch))
> +#endif
> +
And this.
> diff --git a/configure.ac b/configure.ac
> index 9b2f51f..09657b6 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -198,12 +198,11 @@ cpu_arm=no
>
> AS_IF([test x"${enable_asm}" = x"yes"],[
> inline_optimization="No ASM for your platform, please send patches"
> + OPUS_ARM_NEON_INTR_CPPFLAGS=
> case $host_cpu in
> arm*)
> - dnl Currently we only have asm for fixed-point
> - AS_IF([test "$enable_float" != "yes"],[
> cpu_arm=yes
> - AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm optimization])
> + AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm/intrinsic optimization])
Not sure I'm in love with conflating intrinsics with inline assembly.
For example, are these tests (especially the PRESUME_NEON stuff) going
to do the right thing on aarch64?
> AS_GCC_INLINE_ASSEMBLY(
> [inline_optimization="ARM"],
> [inline_optimization="disabled"]
> @@ -212,6 +211,35 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
> AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1],
> [OPUS_ARM_INLINE_MEDIA=0])
> AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0])
> +
> + AC_ARG_ENABLE([arm-neon-intrinsics],
> + AS_HELP_STRING([--enable-arm-neon-intrinsics], [Enable NEON optimisations on ARM CPUs that support it]))
This should specify a default value for enable_arm_neon_intrinsics.
However, I really think this switch should be unified with the
--enable-intrinsics switch currently used by x86.
> +
> + AS_IF([test x"$enable_arm_neon_intrinsics" = x"yes"],
> + [
> + AC_MSG_CHECKING(if compiler supports arm neon intrinsics)
> + save_CFLAGS="$CFLAGS"
> + save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS"
> + AC_COMPILE_IFELSE(
> + [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])],
> + [
> + OPUS_ARM_NEON_INTR=1
> + OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3"
> + AC_SUBST(OPUS_ARM_NEON_INTR_CPPFLAGS)
> + ],
> + [
> + OPUS_ARM_NEON_INTR=0
> + ])
> + CFLAGS="$save_CFLAGS"
> + AS_IF([test x"$OPUS_ARM_NEON_INTR"=x"1"],
> + [AC_MSG_RESULT([yes])],
> + [AC_MSG_RESULT([no])])
> + ],
> + [
> + OPUS_ARM_NEON_INTR=0
> + AC_MSG_WARN([ARMv7 neon intrinsics not enabled])
> + ])
> +
> AS_IF([test x"$inline_optimization" = x"ARM"],[
> AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true])
> AC_DEFINE([OPUS_ARM_INLINE_ASM], 1,
> @@ -220,7 +248,7 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
> AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1],
> [Use ARMv5E inline asm optimizations])
> inline_optimization="$inline_optimization (EDSP)"
> - ])
> + ]n)
Buh?
> AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"],[
> AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1],
> [Use ARMv6 inline asm optimizations])
> @@ -335,13 +363,20 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
> [*** ARM assembly requires perl -- disabling optimizations])
> asm_optimization="(missing perl dependency for ARM)"
> ])
> - ])
> + AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], [
> + AC_DEFINE([OPUS_ARM_NEON_INTR], 1,
> + [Compiler supports ARMv7 Neon Intrinsics]),
> + AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [
> + AC_DEFINE([OPUS_PRESUME_NEON_INTR], 1,
> + [Compiler support arm Intrinsics and target must support neon])],
> + [])
> + AS_IF([test x"enable_rtcd" != x""],
> + [rtcd_support="$rtcd_support (NEON_INTR)"],
> + [])
> + ],[])
> ;;
> esac
> -],[
> - inline_optimization="disabled"
> - asm_optimization="disabled"
> -])
> +],[])
>
> AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
> AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
> @@ -349,6 +384,9 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
> AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
> [test x"${asm_optimization%% *}" = x"ARM"])
>
> +AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
> + [test x"$OPUS_ARM_NEON_INTR" = x"1"])
> +
> AM_CONDITIONAL([HAVE_SSE4_1], [false])
> AM_CONDITIONAL([HAVE_SSE2], [false])
> AS_IF([test x"$enable_intrinsics" = x"yes"],[