[opus] [RFC PATCH v1] arm: kf_bfly4: Introduce ARM neon intrinsics

Sun Nov 9 13:34:27 PST 2014

Optimize the kf_bfly4 function using ARM NEON intrinsics
for SoCs that have a NEON VFP unit.

As an initial step, only ARMv7-VFP based SoCs are targeted.
To enable this optimization, pass --enable-armv7-neon-intrinsics
to the configure command. It is disabled by default.
---
 Makefile.am              |   16 ++++
 celt/_kiss_fft_guts.h    |   13 +++
 celt/arm/kiss_fft_neon.c |  211 ++++++++++++++++++++++++++++++++++++++++++++++
 celt/arm/kiss_fft_neon.h |   37 ++++++++
 celt/kiss_fft.c          |    2 +-
 celt_headers.mk          |    1 +
 celt_sources.mk          |    3 +
 configure.ac             |   14 +++
 8 files changed, 296 insertions(+), 1 deletion(-)
 create mode 100644 celt/arm/kiss_fft_neon.c
 create mode 100644 celt/arm/kiss_fft_neon.h

diff --git a/Makefile.am b/Makefile.am
index e20f7b4..94e2419 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -38,6 +38,12 @@ CELT_SOURCES += $(CELT_SOURCES_SSE)
 endif
 endif
 
+if ARMv7_NEON_INTRINSICS_FLOAT
+noinst_LTLIBRARIES = libneon.la
+libneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON)
+libneon_la_CPPFLAGS = -mfpu=neon-vfpv4 -O3 -I$(top_srcdir)/include
+endif
+
 if CPU_ARM
 CELT_SOURCES += $(CELT_SOURCES_ARM)
 SILK_SOURCES += $(SILK_SOURCES_ARM)
@@ -60,6 +66,10 @@ libopus_la_SOURCES = $(CELT_SOURCES) $(SILK_SOURCES) $(OPUS_SOURCES)
 libopus_la_LDFLAGS = -no-undefined -version-info @OPUS_LT_CURRENT@:@OPUS_LT_REVISION@:@OPUS_LT_AGE@
 libopus_la_LIBADD = $(LIBM)
 
+if ARMv7_NEON_INTRINSICS_FLOAT
+libopus_la_LIBADD += ./libneon.la
+endif
+
 pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_types.h include/opus_defines.h
 
 noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
@@ -97,6 +107,9 @@ celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
 
 celt_tests_test_unit_dft_SOURCES = celt/tests/test_unit_dft.c
 celt_tests_test_unit_dft_LDADD = $(LIBM)
+if ARMv7_NEON_INTRINSICS_FLOAT
+celt_tests_test_unit_dft_LDADD += ./libneon.la
+endif
 
 celt_tests_test_unit_entropy_SOURCES = celt/tests/test_unit_entropy.c
 celt_tests_test_unit_entropy_LDADD = $(LIBM)
@@ -114,6 +127,9 @@ endif
 
 celt_tests_test_unit_mdct_SOURCES = celt/tests/test_unit_mdct.c
 celt_tests_test_unit_mdct_LDADD = $(LIBM)
+if ARMv7_NEON_INTRINSICS_FLOAT
+celt_tests_test_unit_mdct_LDADD += ./libneon.la
+endif
 
 celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
 celt_tests_test_unit_rotation_LDADD = $(LIBM)
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index 5e3d58f..219b431 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -34,6 +34,19 @@
    and defines
    typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
 #include "kiss_fft.h"
+#include "arch.h"
+/*
+void kf_bfly4_c( kiss_fft_cpx * Fout, const size_t fstride,
+		const kiss_fft_state *st,
+		int m, int N, int mm);
+*/
+
+#if defined (ARMv7_NEON_INTRINSICS_FLOAT)
+#include "arm/kiss_fft_neon.h"
+#define kf_bfly4 kf_bfly4_neon
+#else
+#define kf_bfly4 kf_bfly4_c
+#endif
 
 /*
   Explanation of macros dealing with complex math:
diff --git a/celt/arm/kiss_fft_neon.c b/celt/arm/kiss_fft_neon.c
new file mode 100644
index 0000000..a2ae42c
--- /dev/null
+++ b/celt/arm/kiss_fft_neon.c
@@ -0,0 +1,211 @@
+/* Copyright (c) 2014 Linaro Ltd
+   Written by Viswanath Puttagunta */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "../kiss_fft.h"
+#include <arm_neon.h>
+
+#define C_MUL_NEON(m, a, b, t, ones, tv) \
+        do{ /* m = a*b for each packed complex pair; t and tv are scratch; ones is expected to hold (1, -1) per pair */ \
+        t = vrev64q_f32(b); /* swap r/i of b: (b.i, b.r) */ \
+        m = vmulq_f32(a, b); /* (a.r*b.r, a.i*b.i) */ \
+        m = vmulq_f32(m, ones); /* (a.r*b.r, -a.i*b.i) */ \
+        t = vmulq_f32(a, t); /* (a.r*b.i, a.i*b.r) */ \
+        tv = vtrnq_f32(m, t); /* val[0] = (m0, t0, m2, t2), val[1] = (m1, t1, m3, t3) */ \
+        m = vaddq_f32(tv.val[0], tv.val[1]); /* (a.r*b.r - a.i*b.i, a.r*b.i + a.i*b.r) */ \
+        }while(0)
+
+#define ONES_MINUS_ONE  0xbf8000003f800000 /* float bit patterns: low word 0x3f800000 = 1.0f, high word 0xbf800000 = -1.0f; used as the (1, -1) multiplier */
+#define MINUS_ONE       0xbf800000bf800000 /* float bit patterns: both words 0xbf800000 = -1.0f, i.e. (-1, -1) */
+
+static void kf_bfly4_neon_m1(kiss_fft_cpx *Fout, int N) {
+	float32x4_t Fout_4[2];
+	float32x2_t Fout_2[4];
+	float32x2_t scratch_2[2];
+	float32x2_t ones_2 = vcreate_f32(ONES_MINUS_ONE);
+	float32x2_t minusones_2 = vcreate_f32(MINUS_ONE);
+	float *ai = (float *)Fout;	/* read cursor over Fout, in floats */
+	float *bi = (float *)Fout;	/* write cursor over Fout, in floats */
+	int i;
+
+	/* 4-point butterfly for the m == 1 case: each iteration loads,
+	 * updates in place and stores 4 consecutive complex values
+	 * (8 floats) of Fout, just like the normal C code, except each
+	 * NEON instruction consumes 1 complex number (2 floats).
+	 * In theory, one could use Q regs instead of D regs, but odd N
+	 * would then need handling; worth doing only if it justifies
+	 * itself with a performance improvement. */
+
+	for (i = 0; i < N; i++) {
+		Fout_4[0] = vld1q_f32(ai);
+		ai += 4;
+		Fout_4[1] = vld1q_f32(ai);
+		ai += 4;
+		Fout_2[0] = vget_low_f32(Fout_4[0]);	/* 1st complex value */
+		Fout_2[1] = vget_high_f32(Fout_4[0]);	/* 2nd complex value */
+		Fout_2[2] = vget_low_f32(Fout_4[1]);	/* 3rd complex value */
+		Fout_2[3] = vget_high_f32(Fout_4[1]);	/* 4th complex value */
+
+		scratch_2[0] = vsub_f32(Fout_2[0], Fout_2[2]);
+		Fout_2[0] = vadd_f32(Fout_2[0], Fout_2[2]);
+		scratch_2[1] = vadd_f32(Fout_2[1], Fout_2[3]);
+		Fout_2[2] = vsub_f32(Fout_2[0], scratch_2[1]);
+		Fout_2[0] = vadd_f32(Fout_2[0], scratch_2[1]);
+		scratch_2[1] = vsub_f32(Fout_2[1], Fout_2[3]);
+
+		scratch_2[1] = vrev64_f32(scratch_2[1]);	/* swap r and i */
+		/* scratch_2[1] *= (1, -1) */
+		scratch_2[1] = vmul_f32(scratch_2[1], ones_2);
+		Fout_2[1] = vadd_f32(scratch_2[0], scratch_2[1]);
+
+		/* scratch_2[1] *= (-1, -1), so the add below acts as a subtract */
+		scratch_2[1] = vmul_f32(scratch_2[1], minusones_2);
+		Fout_2[3] = vadd_f32(scratch_2[0], scratch_2[1]);
+
+		Fout_4[0] = vcombine_f32(Fout_2[0], Fout_2[1]);
+		Fout_4[1] = vcombine_f32(Fout_2[2], Fout_2[3]);
+
+		vst1q_f32(bi, Fout_4[0]);
+		bi += 4;
+		vst1q_f32(bi, Fout_4[1]);
+		bi += 4;
+	}
+}
+
+static void kf_bfly4_neon_m8(kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm) {
+	int i, j;
+	float32x4_t scratch[6];
+	float32x4_t Fout_4[4];
+	float32x2_t ones_2 = vcreate_f32(ONES_MINUS_ONE);
+	float32x2_t minusones_2 = vcreate_f32(MINUS_ONE);
+	float32x4_t ones = vcombine_f32(ones_2, ones_2);
+	float32x4_t minusones = vcombine_f32(minusones_2, minusones_2);
+	float32x4_t t;
+	float32x4x2_t tv;
+	float *tw1, *tw2, *tw3;	/* twiddles for the even index of each pair */
+	float *tw1_2, *tw2_2, *tw3_2;	/* twiddles for the odd index of each pair */
+	int fstride_2 = 2*fstride;	/* fstride counted in floats (2 per complex) */
+	int fs_tw1 = 2*fstride_2;	/* per-iteration step of tw1/tw1_2; also initial offset of tw2_2 */
+	int fs_tw2 = 4*fstride_2;	/* per-iteration step of tw2/tw2_2 */
+	int fs_tw3 = 6*fstride_2;	/* per-iteration step of tw3/tw3_2 */
+	int fs_x = 3*fstride_2;	/* initial offset of tw3_2 */
+	const int m1 = 2*m;	/* float offset of Fout[m] */
+	const int m2 = 4*m;	/* 2*(2*m): float offset of Fout[2*m] */
+	const int m3 = 6*m;	/* 3*(2*m): float offset of Fout[3*m] */
+	kiss_fft_cpx *Fout_beg = Fout;
+	float32x4_t tw[3];
+	float32x2_t tw_2[6];
+	float *ai;
+
+	/* 4-point butterfly with twiddle multiplies, two j per pass.
+	 * m is guaranteed to be a multiple of 4; however, this
+	 * function works correctly so long as m is a multiple of 2
+	 */
+	celt_assert((m%2 == 0));
+
+	for (i = 0; i < N; i++) {
+		Fout = Fout_beg + i*mm;
+		ai = (float *) Fout;
+		tw1 = tw2 = tw3 = (float *)st->twiddles;
+		tw1_2 = tw1 + fstride_2;
+		tw2_2 = tw1 + fs_tw1;		/* fstride_2*2 */
+		tw3_2 = tw1 + fs_x;		/* fstride_2*3 */
+
+		/* Each iteration handles two consecutive butterfly
+		 * indices: the low 2 lanes of each q reg hold data
+		 * for the even index, the high 2 lanes for the odd
+		 * one (tw*_2 point at the odd-index twiddles).
+		 * So, instead of consuming/updating 4 complex
+		 * values of Fout per cycle, we consume/update 8.
+		 */
+		for (j = 0; j < m/2; j++) {
+			Fout_4[0] = vld1q_f32(ai);
+			Fout_4[1] = vld1q_f32(ai+m1);
+			Fout_4[2] = vld1q_f32(ai+m2);
+			Fout_4[3] = vld1q_f32(ai+m3);
+
+			tw_2[0] = vld1_f32(tw1); tw_2[1] = vld1_f32(tw1_2);
+			tw[0] = vcombine_f32(tw_2[0], tw_2[1]);
+			tw1 += fs_tw1;
+			tw1_2 += fs_tw1;
+
+			tw_2[2] = vld1_f32(tw2); tw_2[3] = vld1_f32(tw2_2);
+			tw[1] = vcombine_f32(tw_2[2], tw_2[3]);
+			tw2 += fs_tw2;
+			tw2_2 += fs_tw2;
+
+			tw_2[4] = vld1_f32(tw3); tw_2[5] = vld1_f32(tw3_2);
+			tw[2] = vcombine_f32(tw_2[4], tw_2[5]);
+			tw3 += fs_tw3;
+			tw3_2 += fs_tw3;
+
+			C_MUL_NEON(scratch[0], Fout_4[1], tw[0], t, ones, tv);
+			C_MUL_NEON(scratch[1], Fout_4[2], tw[1], t, ones, tv);
+			C_MUL_NEON(scratch[2], Fout_4[3], tw[2], t, ones, tv);
+
+			scratch[5] = vsubq_f32(Fout_4[0], scratch[1]);
+			Fout_4[0] = vaddq_f32(Fout_4[0], scratch[1]);
+			scratch[3] = vaddq_f32(scratch[0], scratch[2]);
+			scratch[4] = vsubq_f32(scratch[0], scratch[2]);
+			Fout_4[2] = vsubq_f32(Fout_4[0], scratch[3]);
+			Fout_4[0] = vaddq_f32(Fout_4[0], scratch[3]);
+
+			scratch[4] = vrev64q_f32(scratch[4]);	/* swap r and i */
+			scratch[4] = vmulq_f32(scratch[4], ones);	/* *= (1, -1) per pair */
+			Fout_4[1] = vaddq_f32(scratch[5], scratch[4]);
+			scratch[4] = vmulq_f32(scratch[4], minusones);	/* negate, so the add below subtracts */
+			Fout_4[3] = vaddq_f32(scratch[5], scratch[4]);
+
+			vst1q_f32(ai, Fout_4[0]);
+			vst1q_f32(ai+m1, Fout_4[1]);
+			vst1q_f32(ai+m2, Fout_4[2]);
+			vst1q_f32(ai+m3, Fout_4[3]);
+
+			ai += 4;
+		}
+	}
+}
+
+/* NEON implementation of kf_bfly4_c: picks the kernel based on m */
+void kf_bfly4_neon(kiss_fft_cpx * Fout,
+                     const size_t fstride,
+                     const kiss_fft_state *st,
+                     int m,
+                     int N,
+                     int mm) {
+	if (m != 1) {
+		/* m is guaranteed to be a multiple of 4 */
+		kf_bfly4_neon_m8(Fout, fstride, st, m, N, mm);
+	} else {
+		/* Degenerate case where all twiddles are 1 */
+		kf_bfly4_neon_m1(Fout, N);
+	}
+}
diff --git a/celt/arm/kiss_fft_neon.h b/celt/arm/kiss_fft_neon.h
new file mode 100644
index 0000000..b332e87
--- /dev/null
+++ b/celt/arm/kiss_fft_neon.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2014 Linaro Ltd
+   Written by Viswanath Puttagunta */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef KISS_FFT_NEON_H
+#define KISS_FFT_NEON_H
+
+#include "../kiss_fft.h"
+
+/* NEON implementation of kf_bfly4_c (the generic C version in kiss_fft.c) */
+void kf_bfly4_neon( kiss_fft_cpx * Fout, const size_t fstride,
+                                const kiss_fft_state *st,
+                                int m, int N, int mm);
+#endif
diff --git a/celt/kiss_fft.c b/celt/kiss_fft.c
index cc487fc..5c81f78 100644
--- a/celt/kiss_fft.c
+++ b/celt/kiss_fft.c
@@ -101,7 +101,7 @@ static void kf_bfly2(
    }
 }
 
-static void kf_bfly4(
+void kf_bfly4_c(
                      kiss_fft_cpx * Fout,
                      const size_t fstride,
                      const kiss_fft_state *st,
diff --git a/celt_headers.mk b/celt_headers.mk
index 5bb193e..db4c2f3 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -37,5 +37,6 @@ celt/arm/fixed_armv5e.h \
 celt/arm/kiss_fft_armv4.h \
 celt/arm/kiss_fft_armv5e.h \
 celt/arm/pitch_arm.h \
+celt/arm/kiss_fft_neon.h \
 celt/x86/pitch_sse.h \
 celt/x86/x86cpu.h
diff --git a/celt_sources.mk b/celt_sources.mk
index 20b1b1b..b27bf3e 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -32,3 +32,6 @@ celt/arm/celt_pitch_xcorr_arm.s
 
 CELT_AM_SOURCES_ARM_ASM = \
 celt/arm/armopts.s.in
+
+CELT_SOURCES_ARM_NEON = \
+celt/arm/kiss_fft_neon.c
diff --git a/configure.ac b/configure.ac
index 9b2f51f..488295f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -193,6 +193,20 @@ AC_ARG_ENABLE([intrinsics],
     [AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations (only for fixed point x86)])],,
     [enable_intrinsics=no])
 
+AC_ARG_ENABLE([armv7-neon-intrinsics],
+    [AS_HELP_STRING([--enable-armv7-neon-intrinsics], [Enable intrinsics optimizations for ARMv7 NEON VFP])],,
+    [enable_armv7_neon_intrinsics=no])
+
+AS_IF([test "${enable_armv7_neon_intrinsics}" = "yes"],
+  [
+   AC_CHECK_HEADERS([arm_neon.h])
+   AC_DEFINE([ARMv7_NEON_INTRINSICS_FLOAT], [1], [Use ARMv7 NEON Intrinsics])
+   AM_CONDITIONAL([ARMv7_NEON_INTRINSICS_FLOAT],[true])
+  ],
+  [
+   AM_CONDITIONAL([ARMv7_NEON_INTRINSICS_FLOAT],[false])
+  ])
+
 rtcd_support=no
 cpu_arm=no
 
-- 
1.7.9.5