[opus] [RFC PATCHv1] armv7: celt_pitch_xcorr: Introduce ARM neon intrinsics

Viswanath Puttagunta viswanath.puttagunta at linaro.org
Fri Nov 21 15:38:50 PST 2014


Optimize the celt_pitch_xcorr function (floating-point build)
using ARM NEON intrinsics for SoCs that have a NEON VFP unit.

As an initial step, this targets ARMv7 NEON (VFP3+) based SoCs.
To enable this optimization, use --enable-arm-neon-intrinsics
configure option. This flag is not enabled by default.

Compile time and runtime checks are also supported to make sure
this optimization is only enabled when the compiler supports
neon intrinsics.
---
 Makefile.am                     |   23 +++++++++++
 celt/_kiss_fft_guts.h           |    4 +-
 celt/arch.h                     |    4 +-
 celt/arm/arm_celt_map.c         |   15 +++++++-
 celt/arm/celt_neon_intr.c       |   81 +++++++++++++++++++++++++++++++++++++++
 celt/arm/pitch_arm.h            |   15 +++++++-
 celt/pitch.h                    |   17 ++++++--
 celt/tests/test_unit_mathops.c  |    2 +-
 celt/tests/test_unit_rotation.c |    2 +-
 celt_sources.mk                 |    3 ++
 configure.ac                    |   56 ++++++++++++++++++++++-----
 silk/SigProc_FIX.h              |    4 +-
 silk/macros.h                   |    4 +-
 13 files changed, 205 insertions(+), 25 deletions(-)
 create mode 100644 celt/arm/celt_neon_intr.c

diff --git a/Makefile.am b/Makefile.am
index e20f7b4..0e9e120 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -38,6 +38,12 @@ CELT_SOURCES += $(CELT_SOURCES_SSE)
 endif
 endif
 
+if OPUS_ARM_NEON_INTR
+noinst_LTLIBRARIES = libarmneon.la
+libarmneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON_INTR)
+libarmneon_la_CPPFLAGS = $(OPUS_ARM_NEON_INTR_CPPFLAGS) -I$(top_srcdir)/include
+endif
+
 if CPU_ARM
 CELT_SOURCES += $(CELT_SOURCES_ARM)
 SILK_SOURCES += $(SILK_SOURCES_ARM)
@@ -59,6 +65,9 @@ include opus_headers.mk
 libopus_la_SOURCES = $(CELT_SOURCES) $(SILK_SOURCES) $(OPUS_SOURCES)
 libopus_la_LDFLAGS = -no-undefined -version-info @OPUS_LT_CURRENT@:@OPUS_LT_REVISION@:@OPUS_LT_AGE@
 libopus_la_LIBADD = $(LIBM)
+if OPUS_ARM_NEON_INTR
+libopus_la_LIBADD += ./libarmneon.la
+endif
 
 pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_types.h include/opus_defines.h
 
@@ -97,6 +106,11 @@ celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
 
 celt_tests_test_unit_dft_SOURCES = celt/tests/test_unit_dft.c
 celt_tests_test_unit_dft_LDADD = $(LIBM)
+if OPUS_ARM_NEON_INTR
+celt_tests_test_unit_dft_LDADD += ./libarmneon.la
+endif
+
+
 
 celt_tests_test_unit_entropy_SOURCES = celt/tests/test_unit_entropy.c
 celt_tests_test_unit_entropy_LDADD = $(LIBM)
@@ -110,10 +124,16 @@ if CPU_ARM
 if OPUS_ARM_EXTERNAL_ASM
 celt_tests_test_unit_mathops_LDADD += libopus.la
 endif
+if OPUS_ARM_NEON_INTR
+celt_tests_test_unit_mathops_LDADD += ./libarmneon.la
+endif
 endif
 
 celt_tests_test_unit_mdct_SOURCES = celt/tests/test_unit_mdct.c
 celt_tests_test_unit_mdct_LDADD = $(LIBM)
+if OPUS_ARM_NEON_INTR
+celt_tests_test_unit_mdct_LDADD += ./libarmneon.la
+endif
 
 celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
 celt_tests_test_unit_rotation_LDADD = $(LIBM)
@@ -121,6 +141,9 @@ if CPU_ARM
 if OPUS_ARM_EXTERNAL_ASM
 celt_tests_test_unit_rotation_LDADD += libopus.la
 endif
+if OPUS_ARM_NEON_INTR
+celt_tests_test_unit_rotation_LDADD += ./libarmneon.la
+endif
 endif
 
 celt_tests_test_unit_types_SOURCES = celt/tests/test_unit_types.c
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index 5e3d58f..11a2676 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -90,11 +90,11 @@
     do {(res).r = ADD32((res).r,(a).r);  (res).i = SUB32((res).i,(a).i); \
     }while(0)
 
-#if defined(OPUS_ARM_INLINE_ASM)
+#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT)
 #include "arm/kiss_fft_armv4.h"
 #endif
 
-#if defined(OPUS_ARM_INLINE_EDSP)
+#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT)
 #include "arm/kiss_fft_armv5e.h"
 #endif
 #if defined(MIPSr1_ASM)
diff --git a/celt/arch.h b/celt/arch.h
index 9f74ddd..0e3061c 100644
--- a/celt/arch.h
+++ b/celt/arch.h
@@ -118,9 +118,9 @@ static OPUS_INLINE opus_int16 SAT16(opus_int32 x) {
 
 #include "fixed_generic.h"
 
-#ifdef OPUS_ARM_INLINE_EDSP
+#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT)
 #include "arm/fixed_armv5e.h"
-#elif defined (OPUS_ARM_INLINE_ASM)
+#elif defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT)
 #include "arm/fixed_armv4.h"
 #elif defined (BFIN_ASM)
 #include "fixed_bfin.h"
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index 547a84d..a6313e7 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -42,8 +42,19 @@ opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
   MAY_HAVE_NEON(celt_pitch_xcorr)   /* NEON */
 };
 # else
-#  error "Floating-point implementation is not supported by ARM asm yet." \
- "Reconfigure with --disable-rtcd or send patches."
+/* Float build: only the NEON slot can use an optimized routine. */
+void (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+    const opus_val16 *, opus_val32 *, int, int, int) = {
+  celt_pitch_xcorr_c,              /* ARMv4 */
+  celt_pitch_xcorr_c,              /* EDSP */
+  celt_pitch_xcorr_c,              /* Media */
+#  if defined(OPUS_ARM_NEON_INTR)
+  celt_pitch_xcorr_float_neon      /* NEON */
+#  else
+  celt_pitch_xcorr_c               /* NEON */
+#  endif
+};
+
 # endif
 
 #endif
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
new file mode 100644
index 0000000..88954fb
--- /dev/null
+++ b/celt/arm/celt_neon_intr.c
@@ -0,0 +1,81 @@
+#include <arm_neon.h>
+#include "../arch.h"
+
+/* Computes four adjacent correlation lags in a single pass:
+ *   sum[k] = x[0]*y[k] + x[1]*y[k+1] + ... + x[len-1]*y[k+len-1],  k = 0..3
+ * y is fetched four floats at a time, so y[0..len+2] must be readable; when
+ * len is a multiple of 4 the last main-loop load also touches y[len+3]. */
+static void xcorr_kernel_neon_float(const float *x, const float *y,
+				    float sum[4], int len) {
+	float32x4_t YY[5];
+	float32x4_t XX[4];
+	float32x2_t XX_2;
+	float32x4_t SUMM[4];
+	const float *xi = x;
+	const float *yi = y;
+	int cd = len/4;
+	int cr = len%4;
+	int j;
+
+	celt_assert(len>=3);
+	/* One accumulator per x position within a group of 4; merged at the end */
+	SUMM[0] = vdupq_n_f32(0);
+	SUMM[1] = vdupq_n_f32(0);
+	SUMM[2] = vdupq_n_f32(0);
+	SUMM[3] = vdupq_n_f32(0);
+	YY[0] = vld1q_f32(yi);
+
+	/* Each iteration consumes 4 floats of x and loads the next 4 floats of y.
+	 * YY[1..3] are the window YY[0]:YY[4] shifted by 1, 2 and 3 floats.
+	 */
+	for (j = 0; j < cd; j++) {
+		yi += 4;
+		YY[4] = vld1q_f32(yi);
+		YY[1] = vextq_f32(YY[0], YY[4], 1);
+		YY[2] = vextq_f32(YY[0], YY[4], 2);
+		YY[3] = vextq_f32(YY[0], YY[4], 3);
+		XX[0] = vld1q_dup_f32(xi++);
+		XX[1] = vld1q_dup_f32(xi++);
+		XX[2] = vld1q_dup_f32(xi++);
+		XX[3] = vld1q_dup_f32(xi++);
+		SUMM[0] = vmlaq_f32(SUMM[0], XX[0], YY[0]);
+		SUMM[1] = vmlaq_f32(SUMM[1], XX[1], YY[1]);
+		SUMM[2] = vmlaq_f32(SUMM[2], XX[2], YY[2]);
+		SUMM[3] = vmlaq_f32(SUMM[3], XX[3], YY[3]);
+		YY[0] = YY[4];
+	}
+	/* Handle remaining values max iterations = 3 */
+	for (j = 0; j < cr; j++) {
+		YY[0] = vld1q_f32(yi++);
+		/* dup load writes both lanes; a lane load would read the unset XX_2 */
+		XX_2 = vld1_dup_f32(xi++);
+		SUMM[0] = vmlaq_lane_f32(SUMM[0], YY[0], XX_2, 0);
+	}
+
+	SUMM[0] = vaddq_f32(SUMM[0], SUMM[1]);
+	SUMM[2] = vaddq_f32(SUMM[2], SUMM[3]);
+	SUMM[0] = vaddq_f32(SUMM[0], SUMM[2]);
+	vst1q_f32(sum, SUMM[0]);
+}
+
+/* Float build: xcorr[i] = sum over j in [0,len) of _x[j]*_y[i+j], for every
+ * i in [0,max_pitch). arch is unused; it matches CELT_PITCH_XCORR_IMPL. */
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+			opus_val32 *xcorr, int len, int max_pitch, int arch) {
+	int i, j;
+	(void)arch;
+	celt_assert(max_pitch > 0);
+	celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
+	/* Four lags per call to the NEON kernel */
+	for (i = 0; i < (max_pitch-3); i += 4) {
+		xcorr_kernel_neon_float((float *)_x, (float *)_y+i,
+					(float *)xcorr+i, len);
+	}
+	/* Leftover lags when max_pitch isn't a multiple of 4: plain scalar loop */
+	for (; i < max_pitch; i++) {
+		float sum = 0;
+		for (j = 0; j < len; j++)
+			sum += _x[j]*_y[i+j];
+		xcorr[i] = sum;
+	}
+}
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index a07f8ac..f5adc48 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -52,6 +52,19 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
   ((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
 #  endif
 
-# endif
+#else /* Start !FIXED_POINT */
+/* Float build: NEON intrinsics version of celt_pitch_xcorr. */
+#if defined(OPUS_ARM_NEON_INTR)
+void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
+				opus_val32 *xcorr, int len, int max_pitch, int arch);
+#endif
+/* Without RTCD, when NEON is presumed, call the NEON routine directly. */
+#if !defined(OPUS_HAVE_RTCD) && defined(OPUS_PRESUME_ARM_NEON_INTR)
+#define OVERRIDE_PITCH_XCORR (1)
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+	((void)(arch),celt_pitch_xcorr_float_neon(_x, _y, xcorr, \
+						len, max_pitch, arch))
+#endif
+#endif	/*end of !FIXED_POINT*/
 
 #endif
diff --git a/celt/pitch.h b/celt/pitch.h
index 027ebd9..752b54e 100644
--- a/celt/pitch.h
+++ b/celt/pitch.h
@@ -46,7 +46,7 @@
 #include "mips/pitch_mipsr1.h"
 #endif
 
-#if defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#if defined(OPUS_ARM_ASM)
 # include "arm/pitch_arm.h"
 #endif
 
@@ -189,11 +189,22 @@ opus_val32
 void
 #  endif
 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
-      const opus_val16 *, opus_val32 *, int, int);
+      const opus_val16 *, opus_val32 *, int, int
+#if !defined(FIXED_POINT)
+	,int
+#endif
+);
 
+/* RTCD dispatch: fixed-point entries take 5 arguments, float entries also take arch. */
+#if defined(FIXED_POINT)
+#  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+  ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
+        xcorr, len, max_pitch))
+#else
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((*CELT_PITCH_XCORR_IMPL[(arch)&OPUS_ARCHMASK])(_x, _y, \
-        xcorr, len, max_pitch))
+        xcorr, len, max_pitch, arch))
+#endif
 # else
 #  define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
   ((void)(arch),celt_pitch_xcorr_c(_x, _y, xcorr, len, max_pitch, arch))
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index 3076bbf..cdb2939 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -56,7 +56,7 @@
 #include "x86/celt_lpc_sse.c"
 #endif
 #include "x86/x86_celt_map.c"
-#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#elif defined(OPUS_ARM_ASM)
 #include "arm/arm_celt_map.c"
 #endif
 
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 37ba74e..906fa7e 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -54,7 +54,7 @@
 #include "x86/celt_lpc_sse.c"
 #endif
 #include "x86/x86_celt_map.c"
-#elif defined(OPUS_ARM_ASM) && defined(FIXED_POINT)
+#elif defined(OPUS_ARM_ASM)
 #include "arm/arm_celt_map.c"
 #endif
 
diff --git a/celt_sources.mk b/celt_sources.mk
index 20b1b1b..3d4deca 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -30,5 +30,8 @@ celt/arm/arm_celt_map.c
 CELT_SOURCES_ARM_ASM = \
 celt/arm/celt_pitch_xcorr_arm.s
 
+CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_neon_intr.c
+
 CELT_AM_SOURCES_ARM_ASM = \
 celt/arm/armopts.s.in
diff --git a/configure.ac b/configure.ac
index 9b2f51f..09657b6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -198,12 +198,11 @@ cpu_arm=no
 
 AS_IF([test x"${enable_asm}" = x"yes"],[
     inline_optimization="No ASM for your platform, please send patches"
+    OPUS_ARM_NEON_INTR_CPPFLAGS=
     case $host_cpu in
       arm*)
-        dnl Currently we only have asm for fixed-point
-        AS_IF([test "$enable_float" != "yes"],[
             cpu_arm=yes
-            AC_DEFINE([OPUS_ARM_ASM], [],  [Make use of ARM asm optimization])
+            AC_DEFINE([OPUS_ARM_ASM], [],  [Make use of ARM asm/intrinsic optimization])
             AS_GCC_INLINE_ASSEMBLY(
                 [inline_optimization="ARM"],
                 [inline_optimization="disabled"]
@@ -212,6 +211,35 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
             AS_ASM_ARM_MEDIA([OPUS_ARM_INLINE_MEDIA=1],
                 [OPUS_ARM_INLINE_MEDIA=0])
             AS_ASM_ARM_NEON([OPUS_ARM_INLINE_NEON=1],[OPUS_ARM_INLINE_NEON=0])
+
+	    dnl Opt-in switch: NEON intrinsics are only probed when explicitly requested.
+	    AC_ARG_ENABLE([arm-neon-intrinsics],
+		AS_HELP_STRING([--enable-arm-neon-intrinsics], [Enable NEON optimisations on ARM CPUs that support it]))
+	    AS_IF([test x"$enable_arm_neon_intrinsics" = x"yes"],
+		[
+	        AC_MSG_CHECKING(if compiler supports arm neon intrinsics)
+		dnl Probe with -mfpu=neon added temporarily; CFLAGS is restored below.
+		save_CFLAGS="$CFLAGS"; CFLAGS="-mfpu=neon $CFLAGS"
+		AC_COMPILE_IFELSE(
+		    [AC_LANG_PROGRAM([[#include <arm_neon.h>]], [])],
+		    [
+			OPUS_ARM_NEON_INTR=1
+			OPUS_ARM_NEON_INTR_CPPFLAGS="-mfpu=neon -O3"
+			AC_SUBST(OPUS_ARM_NEON_INTR_CPPFLAGS)
+		    ],
+		    [
+			OPUS_ARM_NEON_INTR=0
+		    ])
+		CFLAGS="$save_CFLAGS"
+		AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"],
+		    [AC_MSG_RESULT([yes])],
+		    [AC_MSG_RESULT([no])])
+		],
+		[
+		    OPUS_ARM_NEON_INTR=0
+		    AC_MSG_WARN([ARMv7 neon intrinsics not enabled])
+		])
+
             AS_IF([test x"$inline_optimization" = x"ARM"],[
                 AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],[true])
                 AC_DEFINE([OPUS_ARM_INLINE_ASM], 1,
@@ -220,7 +248,7 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
                     AC_DEFINE([OPUS_ARM_INLINE_EDSP], [1],
                         [Use ARMv5E inline asm optimizations])
                     inline_optimization="$inline_optimization (EDSP)"
-                ])
+                ])
                 AS_IF([test x"$OPUS_ARM_INLINE_MEDIA" = x"1"],[
                     AC_DEFINE([OPUS_ARM_INLINE_MEDIA], [1],
                         [Use ARMv6 inline asm optimizations])
@@ -335,13 +363,20 @@ AS_IF([test x"${enable_asm}" = x"yes"],[
                   [*** ARM assembly requires perl -- disabling optimizations])
                 asm_optimization="(missing perl dependency for ARM)"
             ])
-        ])
+	   dnl Export the result of the NEON intrinsics probe as preprocessor defines.
+	   AS_IF([test x"$OPUS_ARM_NEON_INTR" = x"1"], [
+		AC_DEFINE([OPUS_ARM_NEON_INTR], 1, [Compiler supports ARMv7 Neon Intrinsics])
+		dnl Name must match the OPUS_PRESUME_ARM_NEON_INTR check in celt/arm/pitch_arm.h.
+		AS_IF([test x"$OPUS_ARM_PRESUME_NEON" = x"1"], [
+			AC_DEFINE([OPUS_PRESUME_ARM_NEON_INTR], 1,
+			    [Compiler support arm Intrinsics and target must support neon])])
+		dnl --disable-rtcd yields enable_rtcd=no. NOTE review: confirm the default value.
+		AS_IF([test x"$enable_rtcd" != x"no"],
+			[rtcd_support="$rtcd_support (NEON_INTR)"])
+		],[])
         ;;
     esac
-],[
-   inline_optimization="disabled"
-   asm_optimization="disabled"
-])
+],[])
 
 AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
 AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
@@ -349,6 +384,9 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM],
 AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
     [test x"${asm_optimization%% *}" = x"ARM"])
 
+AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
+    [test x"$OPUS_ARM_NEON_INTR" = x"1"])
+
 AM_CONDITIONAL([HAVE_SSE4_1], [false])
 AM_CONDITIONAL([HAVE_SSE2], [false])
 AS_IF([test x"$enable_intrinsics" = x"yes"],[
diff --git a/silk/SigProc_FIX.h b/silk/SigProc_FIX.h
index b632994..4c03a33 100644
--- a/silk/SigProc_FIX.h
+++ b/silk/SigProc_FIX.h
@@ -595,11 +595,11 @@ static OPUS_INLINE opus_int64 silk_max_64(opus_int64 a, opus_int64 b)
 #include "MacroCount.h"
 #include "MacroDebug.h"
 
-#ifdef OPUS_ARM_INLINE_ASM
+#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT)
 #include "arm/SigProc_FIX_armv4.h"
 #endif
 
-#ifdef OPUS_ARM_INLINE_EDSP
+#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT)
 #include "arm/SigProc_FIX_armv5e.h"
 #endif
 
diff --git a/silk/macros.h b/silk/macros.h
index 2f24950..217b851 100644
--- a/silk/macros.h
+++ b/silk/macros.h
@@ -138,11 +138,11 @@ static OPUS_INLINE opus_int32 silk_CLZ32(opus_int32 in32)
     (*((Matrix_base_adr) + ((row)+(M)*(column))))
 #endif
 
-#ifdef OPUS_ARM_INLINE_ASM
+#if defined(OPUS_ARM_INLINE_ASM) && defined(FIXED_POINT)
 #include "arm/macros_armv4.h"
 #endif
 
-#ifdef OPUS_ARM_INLINE_EDSP
+#if defined(OPUS_ARM_INLINE_EDSP) && defined(FIXED_POINT)
 #include "arm/macros_armv5e.h"
 #endif
 
-- 
1.7.9.5



More information about the opus mailing list