[opus] [RFC PATCH v1] arm: kf_bfly4: Introduce ARM neon intrinsics
Viswanath Puttagunta
viswanath.puttagunta at linaro.org
Sun Nov 9 13:34:27 PST 2014
Optimize the kf_bfly4 function using ARM NEON intrinsics
for SoCs that have a NEON VFP unit.
As an initial step, only ARMv7-VFP based SoCs are targeted.
To enable this optimization, pass --enable-armv7-neon-intrinsics
when running the configure command. This is disabled by default.
---
Makefile.am | 16 ++++
celt/_kiss_fft_guts.h | 13 +++
celt/arm/kiss_fft_neon.c | 211 ++++++++++++++++++++++++++++++++++++++++++++++
celt/arm/kiss_fft_neon.h | 37 ++++++++
celt/kiss_fft.c | 2 +-
celt_headers.mk | 1 +
celt_sources.mk | 3 +
configure.ac | 14 +++
8 files changed, 296 insertions(+), 1 deletion(-)
create mode 100644 celt/arm/kiss_fft_neon.c
create mode 100644 celt/arm/kiss_fft_neon.h
diff --git a/Makefile.am b/Makefile.am
index e20f7b4..94e2419 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -38,6 +38,12 @@ CELT_SOURCES += $(CELT_SOURCES_SSE)
endif
endif
+if ARMv7_NEON_INTRINSICS_FLOAT
+noinst_LTLIBRARIES = libneon.la
+libneon_la_SOURCES = $(CELT_SOURCES_ARM_NEON)
+libneon_la_CPPFLAGS = -mfpu=neon-vfpv4 -O3 -I$(top_srcdir)/include
+endif
+
if CPU_ARM
CELT_SOURCES += $(CELT_SOURCES_ARM)
SILK_SOURCES += $(SILK_SOURCES_ARM)
@@ -60,6 +66,10 @@ libopus_la_SOURCES = $(CELT_SOURCES) $(SILK_SOURCES) $(OPUS_SOURCES)
libopus_la_LDFLAGS = -no-undefined -version-info @OPUS_LT_CURRENT@:@OPUS_LT_REVISION@:@OPUS_LT_AGE@
libopus_la_LIBADD = $(LIBM)
+if ARMv7_NEON_INTRINSICS_FLOAT
+libopus_la_LIBADD += ./libneon.la
+endif
+
pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_types.h include/opus_defines.h
noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
@@ -97,6 +107,9 @@ celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
celt_tests_test_unit_dft_SOURCES = celt/tests/test_unit_dft.c
celt_tests_test_unit_dft_LDADD = $(LIBM)
+if ARMv7_NEON_INTRINSICS_FLOAT
+celt_tests_test_unit_dft_LDADD += ./libneon.la
+endif
celt_tests_test_unit_entropy_SOURCES = celt/tests/test_unit_entropy.c
celt_tests_test_unit_entropy_LDADD = $(LIBM)
@@ -114,6 +127,9 @@ endif
celt_tests_test_unit_mdct_SOURCES = celt/tests/test_unit_mdct.c
celt_tests_test_unit_mdct_LDADD = $(LIBM)
+if ARMv7_NEON_INTRINSICS_FLOAT
+celt_tests_test_unit_mdct_LDADD += ./libneon.la
+endif
celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
celt_tests_test_unit_rotation_LDADD = $(LIBM)
diff --git a/celt/_kiss_fft_guts.h b/celt/_kiss_fft_guts.h
index 5e3d58f..219b431 100644
--- a/celt/_kiss_fft_guts.h
+++ b/celt/_kiss_fft_guts.h
@@ -34,6 +34,19 @@
and defines
typedef struct { kiss_fft_scalar r; kiss_fft_scalar i; }kiss_fft_cpx; */
#include "kiss_fft.h"
+#include "arch.h"
+/*
+void kf_bfly4_c( kiss_fft_cpx * Fout, const size_t fstride,
+ const kiss_fft_state *st,
+ int m, int N, int mm);
+*/
+
+#if defined (ARMv7_NEON_INTRINSICS_FLOAT)
+#include "arm/kiss_fft_neon.h"
+#define kf_bfly4 kf_bfly4_neon
+#else
+#define kf_bfly4 kf_bfly4_c
+#endif
/*
Explanation of macros dealing with complex math:
diff --git a/celt/arm/kiss_fft_neon.c b/celt/arm/kiss_fft_neon.c
new file mode 100644
index 0000000..a2ae42c
--- /dev/null
+++ b/celt/arm/kiss_fft_neon.c
@@ -0,0 +1,211 @@
+/* Copyright (c) 2014 Linaro Ltd
+ Written by Viswanath Puttagunta */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "../kiss_fft.h"
+#include <arm_neon.h>
+
+#define C_MUL_NEON(m, a, b, t, ones, tv) \
+ do{ /* m = a*b for each (re, im) lane pair; t and tv are scratch */ \
+ t = vrev64q_f32(b); /* t = (b.i, b.r) */ \
+ m = vmulq_f32(a, b); /* m = (a.r*b.r, a.i*b.i) */ \
+ m = vmulq_f32(m, ones); /* m = (a.r*b.r, -a.i*b.i); expects ones = (1, -1) per pair */ \
+ t = vmulq_f32(a, t); /* t = (a.r*b.i, a.i*b.r) */ \
+ tv = vtrnq_f32(m, t); /* val[0] = (m0, t0, ...), val[1] = (m1, t1, ...) */ \
+ m = vaddq_f32(tv.val[0], tv.val[1]); /* (a.r*b.r - a.i*b.i, a.r*b.i + a.i*b.r) */ \
+ }while(0)
+
+#define ONES_MINUS_ONE 0xbf8000003f800000 /* float bits: low word 1.0, high word -1.0; used as (1, -1) */
+#define MINUS_ONE 0xbf800000bf800000 /* float bits: both words -1.0; used as (-1, -1) */
+
+static void kf_bfly4_neon_m1(kiss_fft_cpx *Fout, int N) {
+ float32x4_t Fout_4[2];
+ float32x2_t Fout_2[4];
+ float32x2_t scratch_2[2];
+ float32x2_t ones_2 = vcreate_f32(ONES_MINUS_ONE);
+ float32x2_t minusones_2 = vcreate_f32(MINUS_ONE);
+ float *ai = (float *)Fout; /* read cursor over Fout, in floats */
+ float *bi = (float *)Fout; /* write cursor over Fout, in floats */
+ int i;
+
+ /* Radix-4 butterfly for m == 1 (no twiddle multiplies), done in place:
+ * each of the N iterations consumes/updates 4 consecutive complex
+ * Fout values, just like the plain C code, except that each NEON
+ * D-register operation handles 1 complex number (2 floats).
+ * In theory one could use Q regs instead of D regs, but then the
+ * case of odd N has to be handled; that is only worth doing if it
+ * yields a justifiable performance improvement. */
+
+ for (i = 0; i < N; i++) {
+ Fout_4[0] = vld1q_f32(ai); /* F0, F1 */
+ ai += 4;
+ Fout_4[1] = vld1q_f32(ai); /* F2, F3 */
+ ai += 4;
+ Fout_2[0] = vget_low_f32(Fout_4[0]); /* F0 */
+ Fout_2[1] = vget_high_f32(Fout_4[0]); /* F1 */
+ Fout_2[2] = vget_low_f32(Fout_4[1]); /* F2 */
+ Fout_2[3] = vget_high_f32(Fout_4[1]); /* F3 */
+
+ scratch_2[0] = vsub_f32(Fout_2[0], Fout_2[2]); /* s0 = F0 - F2 */
+ Fout_2[0] = vadd_f32(Fout_2[0], Fout_2[2]); /* F0 += F2 */
+ scratch_2[1] = vadd_f32(Fout_2[1], Fout_2[3]); /* s1 = F1 + F3 */
+ Fout_2[2] = vsub_f32(Fout_2[0], scratch_2[1]); /* F2 = F0 - s1 */
+ Fout_2[0] = vadd_f32(Fout_2[0], scratch_2[1]); /* F0 += s1 */
+ scratch_2[1] = vsub_f32(Fout_2[1], Fout_2[3]); /* s1 = F1 - F3 */
+
+ scratch_2[1] = vrev64_f32(scratch_2[1]); /* swap lanes: (s1.i, s1.r) */
+ /* scratch_2[1] *= (1, -1), giving (s1.i, -s1.r) */
+ scratch_2[1] = vmul_f32(scratch_2[1], ones_2);
+ Fout_2[1] = vadd_f32(scratch_2[0], scratch_2[1]); /* F1 = s0 + (s1.i, -s1.r) */
+
+ /* scratch_2[1] *= (-1, -1), giving (-s1.i, s1.r) */
+ scratch_2[1] = vmul_f32(scratch_2[1], minusones_2);
+ Fout_2[3] = vadd_f32(scratch_2[0], scratch_2[1]); /* F3 = s0 + (-s1.i, s1.r) */
+
+ Fout_4[0] = vcombine_f32(Fout_2[0], Fout_2[1]);
+ Fout_4[1] = vcombine_f32(Fout_2[2], Fout_2[3]);
+
+ vst1q_f32(bi, Fout_4[0]); /* store F0, F1 back in place */
+ bi += 4;
+ vst1q_f32(bi, Fout_4[1]); /* store F2, F3 back in place */
+ bi += 4;
+ }
+}
+
+static void kf_bfly4_neon_m8(kiss_fft_cpx * Fout, /* in/out: updated in place */
+ const size_t fstride, /* twiddle stride, in complex twiddles */
+ const kiss_fft_state *st, /* only st->twiddles is read here */
+ int m, /* butterflies per group; must be even */
+ int N, /* number of groups processed */
+ int mm) { /* offset between groups, in complex values */
+ int i, j;
+ float32x4_t scratch[6];
+ float32x4_t Fout_4[4];
+ float32x2_t ones_2 = vcreate_f32(ONES_MINUS_ONE);
+ float32x2_t minusones_2 = vcreate_f32(MINUS_ONE);
+ float32x4_t ones = vcombine_f32(ones_2, ones_2); /* (1, -1, 1, -1) */
+ float32x4_t minusones = vcombine_f32(minusones_2, minusones_2); /* all lanes -1 */
+ float32x4_t t; /* scratch for C_MUL_NEON */
+ float32x4x2_t tv; /* scratch for C_MUL_NEON */
+ float *tw1, *tw2, *tw3; /* twiddle cursors for lanes 0-1 */
+ float *tw1_2, *tw2_2, *tw3_2; /* twiddle cursors for lanes 2-3 */
+ int fstride_2 = 2*fstride; /* fstride in floats (twiddles are read as 2 floats each) */
+ int fs_tw1 = 2*fstride_2; /* per-iteration step of tw1/tw1_2, in floats */
+ int fs_tw2 = 4*fstride_2; /* per-iteration step of tw2/tw2_2, in floats */
+ int fs_tw3 = 6*fstride_2; /* per-iteration step of tw3/tw3_2, in floats */
+ int fs_x = 3*fstride_2; /* initial offset of tw3_2, in floats */
+ const int m1 = 2*m; /* m complex values, as a float offset */
+ const int m2 = 4*m; /* 2*(2*m) */
+ const int m3 = 6*m; /* 3*(2*m) */
+ kiss_fft_cpx *Fout_beg = Fout;
+ float32x4_t tw[3];
+ float32x2_t tw_2[6];
+ float *ai; /* float cursor into the current group */
+
+ /* Radix-4 butterflies with twiddles, two butterflies per iteration.
+ * m is guaranteed to be a multiple of 4; however, this function
+ * works properly so long as m is a multiple of 2.
+ */
+ celt_assert((m%2 == 0));
+
+ for (i = 0; i < N; i++) {
+ Fout = Fout_beg + i*mm;
+ ai = (float *) Fout;
+ tw1 = tw2 = tw3 = (float *)st->twiddles; /* lanes 0-1 start at the first twiddle */
+ tw1_2 = tw1 + fstride_2; /* fstride_2*1 */
+ tw2_2 = tw1 + fs_tw1; /* fstride_2*2 */
+ tw3_2 = tw1 + fs_x; /* fstride_2*3 */
+
+ /* In each cycle, the low 2 lanes of the q regs hold the
+ * data of the even butterfly (index 2*j) and the high 2
+ * lanes hold the data of the odd butterfly (index 2*j+1).
+ * So, instead of consuming/updating 4 complex
+ * values of Fout per cycle, we consume/update 8
+ * complex values of Fout.
+ */
+ for (j = 0; j < m/2; j++) {
+ Fout_4[0] = vld1q_f32(ai); /* F0 for both butterflies */
+ Fout_4[1] = vld1q_f32(ai+m1); /* F1, m complex values further */
+ Fout_4[2] = vld1q_f32(ai+m2); /* F2, 2*m complex values further */
+ Fout_4[3] = vld1q_f32(ai+m3); /* F3, 3*m complex values further */
+
+ tw_2[0] = vld1_f32(tw1); tw_2[1] = vld1_f32(tw1_2);
+ tw[0] = vcombine_f32(tw_2[0], tw_2[1]); /* first twiddle of each butterfly */
+ tw1 += fs_tw1;
+ tw1_2 += fs_tw1;
+
+ tw_2[2] = vld1_f32(tw2); tw_2[3] = vld1_f32(tw2_2);
+ tw[1] = vcombine_f32(tw_2[2], tw_2[3]); /* second twiddle of each butterfly */
+ tw2 += fs_tw2;
+ tw2_2 += fs_tw2;
+
+ tw_2[4] = vld1_f32(tw3); tw_2[5] = vld1_f32(tw3_2);
+ tw[2] = vcombine_f32(tw_2[4], tw_2[5]); /* third twiddle of each butterfly */
+ tw3 += fs_tw3;
+ tw3_2 += fs_tw3;
+
+ C_MUL_NEON(scratch[0], Fout_4[1], tw[0], t, ones, tv); /* s0 = F1 * tw[0] */
+ C_MUL_NEON(scratch[1], Fout_4[2], tw[1], t, ones, tv); /* s1 = F2 * tw[1] */
+ C_MUL_NEON(scratch[2], Fout_4[3], tw[2], t, ones, tv); /* s2 = F3 * tw[2] */
+
+ scratch[5] = vsubq_f32(Fout_4[0], scratch[1]); /* s5 = F0 - s1 */
+ Fout_4[0] = vaddq_f32(Fout_4[0], scratch[1]); /* F0 += s1 */
+ scratch[3] = vaddq_f32(scratch[0], scratch[2]); /* s3 = s0 + s2 */
+ scratch[4] = vsubq_f32(scratch[0], scratch[2]); /* s4 = s0 - s2 */
+ Fout_4[2] = vsubq_f32(Fout_4[0], scratch[3]); /* F2 = F0 - s3 */
+ Fout_4[0] = vaddq_f32(Fout_4[0], scratch[3]); /* F0 += s3 */
+
+ scratch[4] = vrev64q_f32(scratch[4]); /* swap each pair: (s4.i, s4.r) */
+ scratch[4] = vmulq_f32(scratch[4], ones); /* (s4.i, -s4.r) */
+ Fout_4[1] = vaddq_f32(scratch[5], scratch[4]); /* F1 = s5 + (s4.i, -s4.r) */
+ scratch[4] = vmulq_f32(scratch[4], minusones); /* (-s4.i, s4.r) */
+ Fout_4[3] = vaddq_f32(scratch[5], scratch[4]); /* F3 = s5 + (-s4.i, s4.r) */
+
+ vst1q_f32(ai, Fout_4[0]);
+ vst1q_f32(ai+m1, Fout_4[1]);
+ vst1q_f32(ai+m2, Fout_4[2]);
+ vst1q_f32(ai+m3, Fout_4[3]);
+
+ ai += 4; /* advance by 2 complex values (4 floats) */
+ }
+ }
+}
+
+/* NEON implementation of kf_bfly4_c: dispatches on m to one of the two kernels above */
+void kf_bfly4_neon(kiss_fft_cpx * Fout,
+ const size_t fstride,
+ const kiss_fft_state *st,
+ int m,
+ int N,
+ int mm) {
+ if (m == 1) {
+ /* Degenerate case where all twiddles are 1 */
+ kf_bfly4_neon_m1(Fout, N);
+ } else {
+ /* m is guaranteed to be a multiple of 4 (the kernel only asserts that it is even) */
+ kf_bfly4_neon_m8(Fout, fstride, st, m, N, mm);
+ }
+}
diff --git a/celt/arm/kiss_fft_neon.h b/celt/arm/kiss_fft_neon.h
new file mode 100644
index 0000000..b332e87
--- /dev/null
+++ b/celt/arm/kiss_fft_neon.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2014 Linaro Ltd
+ Written by Viswanath Puttagunta */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef KISS_FFT_NEON_H
+#define KISS_FFT_NEON_H
+
+#include "../kiss_fft.h"
+
+/* NEON implementation of kf_bfly4_c; takes the same six arguments */
+void kf_bfly4_neon( kiss_fft_cpx * Fout, const size_t fstride,
+ const kiss_fft_state *st,
+ int m, int N, int mm);
+#endif
diff --git a/celt/kiss_fft.c b/celt/kiss_fft.c
index cc487fc..5c81f78 100644
--- a/celt/kiss_fft.c
+++ b/celt/kiss_fft.c
@@ -101,7 +101,7 @@ static void kf_bfly2(
}
}
-static void kf_bfly4(
+void kf_bfly4_c(
kiss_fft_cpx * Fout,
const size_t fstride,
const kiss_fft_state *st,
diff --git a/celt_headers.mk b/celt_headers.mk
index 5bb193e..db4c2f3 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -37,5 +37,6 @@ celt/arm/fixed_armv5e.h \
celt/arm/kiss_fft_armv4.h \
celt/arm/kiss_fft_armv5e.h \
celt/arm/pitch_arm.h \
+celt/arm/kiss_fft_neon.h \
celt/x86/pitch_sse.h \
celt/x86/x86cpu.h
diff --git a/celt_sources.mk b/celt_sources.mk
index 20b1b1b..b27bf3e 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -32,3 +32,6 @@ celt/arm/celt_pitch_xcorr_arm.s
CELT_AM_SOURCES_ARM_ASM = \
celt/arm/armopts.s.in
+
+CELT_SOURCES_ARM_NEON = \
+celt/arm/kiss_fft_neon.c
diff --git a/configure.ac b/configure.ac
index 9b2f51f..488295f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -193,6 +193,20 @@ AC_ARG_ENABLE([intrinsics],
[AS_HELP_STRING([--enable-intrinsics], [Enable intrinsics optimizations (only for fixed point x86)])],,
[enable_intrinsics=no])
+AC_ARG_ENABLE([armv7-neon-intrinsics],
+ [AS_HELP_STRING([--enable-armv7-neon-intrinsics], [Enable intrinsics optimizations for ARMv7 NEON VFP])],,
+ [enable_armv7_neon_intrinsics=no])
+
+AS_IF([test "${enable_armv7_neon_intrinsics}" = "yes"],
+ [
+ AC_CHECK_HEADERS([arm_neon.h])
+ AC_DEFINE([ARMv7_NEON_INTRINSICS_FLOAT], [1], [Use ARMv7 NEON Intrinsics])
+ AM_CONDITIONAL([ARMv7_NEON_INTRINSICS_FLOAT],[true])
+ ],
+ [
+ AM_CONDITIONAL([ARMv7_NEON_INTRINSICS_FLOAT],[false])
+ ])
+
rtcd_support=no
cpu_arm=no
--
1.7.9.5
More information about the opus
mailing list