[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics
Timothy B. Terriberry
tterribe at xiph.org
Fri Nov 28 13:52:12 PST 2014
Review comments inline.
> +if OPUS_ARM_NEON_INTR
> +noinst_LTLIBRARIES = libarmneon.la
> +libarmneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON_INTR)
> +libarmneon_la_CPPFLAGS = $(OPUS_ARM_NEON_INTR_CPPFLAGS) -I$(top_srcdir)/include
> +endif
I don't think these should be in a separate library. A separate library
brings a lot of complications with it (to name one: wouldn't the .pc files
need to be updated?). Please use the same mechanism that the SSE intrinsics
use to add CFLAGS to the compilation of specific object files, e.g.,
$(SSE_OBJ): CFLAGS += -msse4.1
> +void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
> + const opus_val16 *, opus_val32 *, int , int, int) = {
> + celt_pitch_xcorr_c, /* ARMv4 */
> + celt_pitch_xcorr_c, /* EDSP */
> + celt_pitch_xcorr_c, /* Media */
> +#if defined(OPUS_ARM_NEON_INTR)
> + celt_pitch_xcorr_float_neon /* Neon */
Please do not use tabs in source code (this applies here and everywhere
below). Even with the tabs expanded in context, the comments here do not
line up properly.
> +static void xcorr_kernel_neon_float(float *x, float *y, float sum[4], int len) {
x and y should be const.
> + float32x4_t YY[5];
> + float32x4_t XX[4];
> + float32x2_t XX_2;
> + float32x4_t SUMM[4];
> + float *xi = x;
> + float *yi = y;
> + int cd = len/4;
> + int cr = len%4;
len is signed, so / and % are NOT equivalent to the corresponding >> and
& (and the signed operations are much slower).
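For illustration only (this leans on the patch's own celt_assert(len>=3), so
len is known to be non-negative), the counts could be written as

  int cd = len>>2;  /* same result as len/4 for non-negative len, one shift */
  int cr = len&3;   /* same result as len%4 for non-negative len, one mask  */

or len could simply be treated as unsigned for these two operations.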
> + int j;
> +
> + celt_assert(len>=3);
> +
> + /* Initialize sums to 0 */
> + SUMM[0] = vdupq_n_f32(0);
> + SUMM[1] = vdupq_n_f32(0);
> + SUMM[2] = vdupq_n_f32(0);
> + SUMM[3] = vdupq_n_f32(0);
> +
> + YY[0] = vld1q_f32(yi);
> +
> + /* Each loop consumes 8 floats in y vector
> + * and 4 floats in x vector
> + */
> + for (j = 0; j < cd; j++) {
> + yi += 4;
> + YY[4] = vld1q_f32(yi);
If len == 4, then in the first iteration you will have loaded 8 y
values, but only 7 are guaranteed to be available (the C code only
references y[0] up to y[len-1+3], i.e., y[6] here). You need to end this loop early and
fall back to another approach. See comments in celt_pitch_xcorr_arm.s
for details and an example (there are other useful comments there that
could shave another cycle or two from this inner loop).
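One possible shape for that (just a sketch; the scheme described in
celt_pitch_xcorr_arm.s is more refined): run the 8-float loop only while the
next 4-float load of y is known to stay within y[0]..y[len+2], and let the
remainder loop below absorb the last one to four x values, e.g.

  /* Sketch: (len-1)>>2 iterations keep every YY[4] load inside
   * y[0]..y[len+2]; the remainder loop then handles up to 4 leftovers. */
  int cd = (len-1)>>2;
  int cr = len-(cd<<2);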
> + YY[1] = vextq_f32(YY[0], YY[4], 1);
> + YY[2] = vextq_f32(YY[0], YY[4], 2);
> + YY[3] = vextq_f32(YY[0], YY[4], 3);
> +
> + XX[0] = vld1q_dup_f32(xi++);
> + XX[1] = vld1q_dup_f32(xi++);
> + XX[2] = vld1q_dup_f32(xi++);
> + XX[3] = vld1q_dup_f32(xi++);
Don't do this. Do a single load and use vmlaq_lane_f32() to multiply by
each value. That should cut at least 5 cycles out of this loop.
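E.g., something along these lines (a sketch; XX0, XX_lo, and XX_hi are just
illustrative names, and this also subsumes the four vmlaq_f32() calls below):

  float32x4_t XX0 = vld1q_f32(xi);     /* one load of x[0]..x[3] */
  float32x2_t XX_lo = vget_low_f32(XX0);
  float32x2_t XX_hi = vget_high_f32(XX0);
  xi += 4;
  SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_lo, 0);
  SUMM[1] = vmlaq_lane_f32(SUMM[1], YY[1], XX_lo, 1);
  SUMM[2] = vmlaq_lane_f32(SUMM[2], YY[2], XX_hi, 0);
  SUMM[3] = vmlaq_lane_f32(SUMM[3], YY[3], XX_hi, 1);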
> +
> + SUMM[0] = vmlaq_f32(SUMM[0], XX[0], YY[0]);
> + SUMM[1] = vmlaq_f32(SUMM[1], XX[1], YY[1]);
> + SUMM[2] = vmlaq_f32(SUMM[2], XX[2], YY[2]);
> + SUMM[3] = vmlaq_f32(SUMM[3], XX[3], YY[3]);
> + YY[0] = YY[4];
> + }
> +
> + /* Handle remaining values max iterations = 3 */
> + for (j = 0; j < cr; j++) {
> + YY[0] = vld1q_f32(yi++);
This load is always redundant in the first iteration, which is a bit
unfortunate.
> + XX_2 = vld1_lane_f32(xi++, XX_2, 0);
Don't load a single lane when you don't need the value(s) in the other
lane(s). Use vld1_dup_f32() instead. It's faster and breaks dependencies.
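I.e., simply

  XX_2 = vld1_dup_f32(xi++);  /* broadcast *xi into both lanes */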
> + SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_2, 0);
> + }
> +
> + SUMM[0] = vaddq_f32(SUMM[0], SUMM[1]);
> + SUMM[2] = vaddq_f32(SUMM[2], SUMM[3]);
> + SUMM[0] = vaddq_f32(SUMM[0], SUMM[2]);
> +
> + vst1q_f32(sum, SUMM[0]);
> +}
> +
> +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
> + opus_val32 *xcorr, int len, int max_pitch, int arch) {
arch is unused. There's no reason to pass it here. If we're here, we
know what the arch is.
> + int i, j;
> +
> + celt_assert(max_pitch > 0);
> + celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
> +
> + for (i = 0; i < (max_pitch-3); i += 4) {
> + xcorr_kernel_neon_float((float *)_x, (float *)_y+i,
> + (float *)xcorr+i, len);
> + }
> +
> + /* In case max_pitch isn't multiple of 4, do unrolled version */
> + for (; i < max_pitch; i++) {
> + float sum = 0;
> + float *yi = _y+i;
> + for (j = 0; j < len; j++)
> + sum += _x[i]*yi[i];
> + xcorr[i] = sum;
> + }
This loop can still be largely vectorized. Any reason not to do so?
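For example, something like the following (a rough sketch, not from the
patch; xj, yj, acc and accp are illustrative names, and it accumulates
x[j]*y[i+j] as the C reference kernel does):

  for (; i < max_pitch; i++) {
    const float *xj = (const float *)_x;
    const float *yj = (const float *)_y+i;
    float32x4_t acc = vdupq_n_f32(0);
    float32x2_t accp;
    float sum;
    for (j = 0; j <= len-4; j += 4)
      acc = vmlaq_f32(acc, vld1q_f32(xj+j), vld1q_f32(yj+j));
    /* Horizontal add of the four partial sums. */
    accp = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    accp = vpadd_f32(accp, accp);
    sum = vget_lane_f32(accp, 0);
    /* Scalar tail for the remaining len%4 samples. */
    for (; j < len; j++)
      sum += xj[j]*yj[j];
    xcorr[i] = sum;
  }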
> +}
> diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
> index a07f8ac..f5adc48 100644
> --- a/celt/arm/pitch_arm.h
> +++ b/celt/arm/pitch_arm.h
> @@ -52,6 +52,19 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
> ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
> # endif
>
> -# endif
> +#else /* Start !FIXED_POINT */
> +/* Float case */
> +#if defined(OPUS_ARM_NEON_INTR)
> +void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
> + opus_val32 *xcorr, int len, int max_pitch, int arch);
> +#endif
> +
> +#if !defined(OPUS_HAVE_RTCD) && defined(OPUS_PRESUME_ARM_NEON_INTR)
> +#define OVERRIDE_PITCH_XCORR (1)
> +#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
> + ((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, \
> + len, max_pitch, arch))
Again, don't pass arch.
> (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
> - const opus_val16 *, opus_val32 *, int, int);
> + const opus_val16 *, opus_val32 *, int, int
> +#if !defined(FIXED_POINT)
> + ,int
> +#endif
> +);
Which gets rid of this ugliness.
> +#if defined(FIXED_POINT)
> +# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
> + ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
> + xcorr, len, max_pitch)
> +#else
> # define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
> ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
> - xcorr, len, max_pitch))
> + xcorr, len, max_pitch, arch))
> +#endif
> +
And this.
> diff --git a/configure.ac b/configure.ac
> index 9b2f51f..09657b6 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -198,12 +198,11 @@ cpu_arm=no
>
> AS_IF([test x"${enable_asm}" = x"yes"],[
> inline_optimization="No ASM for your platform, please send patches"
> + OPUS_ARM_NEON_INTR_CPPFLAGS=
> case $host_cpu in
> arm*)
> - dnl Currently we only have asm for fixed-point
> - AS_IF([test "$enable_float" != "yes"],[
> cpu_arm=yes
> - AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm optimization])
> + AC_DEFINE([OPUS_ARM_ASM], [], [Make use of ARM asm/intrinsic optimization])
Not sure I'm in love with conflating intrinsics with inline assembly.
For example, are these tests (especially the PRESUME_NEON stuff) going
to do the right thing on aarch64?
> AS_GCC_INLINE_ASSEMBLY(
> [inline_optimization="ARM"],
> [inline_optimization="disabled"]
> @@ -212,6 +211,35 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
> AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1],
> [OPUS_ARM_INLINE_MEDIA=0])
> AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0])
> +
> + AC_ARG_ENABLE([arm-neon-intrinsics],
> + AS_HELP_STRING([--enable-arm-neon-intrinsics], [Enable NEON optimisations on ARM CPUs that support it]))
This should specify a default value for enable_arm_neon_intrinsics.
However, I really think this switch should be unified with the
--enable-intrinsics switch currently used by x86.
> +
> + AS_IF([test x"$enable_arm_neon_intrinsics" = x"yes"],
> + [
> + AC_MSG_CHECKING(if compiler supports arm neon intrinsics)
> + save_CFLAGS="$CFLAGS"
> + save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS"
> + AC_COMPILE_IFELSE(
> + [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])],
> + [
> + OPUS_ARM_NEON_INTR=1
> + OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3"
> + AC_SUBST(OPUS_ARM_NEON_INTR_CPPFLAGS)
> + ],
> + [
> + OPUS_ARM_NEON_INTR=0
> + ])
> + CFLAGS="$save_CFLAGS"
> + AS_IF([test x"$OPUS_ARM_NEON_INTR"=x"1"],
> + [AC_MSG_RESULT([yes])],
> + [AC_MSG_RESULT([no])])
> + ],
> + [
> + OPUS_ARM_NEON_INTR=0
> + AC_MSG_WARN([ARMv7 neon intrinsics not enabled])
> + ])
> +
> AS_IF([test x"$inline_optimization" = x"ARM"],[
> AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true])
> AC_DEFINE([OPUS_ARM_INLINE_ASM], 1,
> @@ -220,7 +248,7 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
> AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1],
> [Use ARMv5E inline asm optimizations])
> inline_optimization="$inline_optimization (EDSP)"
> - ])
> + ]n)
Buh?
> AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"],[
> AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1],
> [Use ARMv6 inline asm optimizations])
> @@ -335,13 +363,20 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
> [*** ARM assembly requires perl -- disabling optimizations])
> asm_optimization="(missing perl dependency for ARM)"
> ])
> - ])
> + AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], [
> + AC_DEFINE([OPUS_ARM_NEON_INTR], 1,
> + [Compiler supports ARMv7 Neon Intrinsics]),
> + AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [
> + AC_DEFINE([OPUS_PRESUME_NEON_INTR], 1,
> + [Compiler support arm Intrinsics and target must support neon])],
> + [])
> + AS_IF([test x"enable_rtcd" != x""],
> + [rtcd_support="$rtcd_support (NEON_INTR)"],
> + [])
> + ],[])
> ;;
> esac
> -],[
> - inline_optimization="disabled"
> - asm_optimization="disabled"
> -])
> +],[])
>
> AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
> AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
> @@ -349,6 +384,9 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
> AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
> [test x"${asm_optimization%% *}" = x"ARM"])
>
> +AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
> + [test x"$OPUS_ARM_NEON_INTR" = x"1"])
> +
> AM_CONDITIONAL([HAVE_SSE4_1], [false])
> AM_CONDITIONAL([HAVE_SSE2], [false])
> AS_IF([test x"$enable_intrinsics" = x"yes"],[