[opus] [PATCH 2/2] Optimize fixed-point celt_fir_c() for ARM NEON
Linfeng Zhang
linfengz at google.com
Fri Jun 17 21:09:17 UTC 2016
Create the fixed-point intrinsics optimization celt_fir_neon() for ARM NEON.
Create test celt/tests/test_unit_optimization_lpc to unit test the optimization.
---
.gitignore | 2 +
Makefile.am | 17 ++-
celt/arm/arm_celt_map.c | 17 +++
celt/arm/celt_lpc_arm.h | 65 ++++++++
celt/arm/celt_lpc_neon_intr.c | 254 ++++++++++++++++++++++++++++++++
celt/celt_lpc.h | 5 +
celt/tests/test_unit_dft.c | 1 +
celt/tests/test_unit_mathops.c | 1 +
celt/tests/test_unit_mdct.c | 1 +
celt/tests/test_unit_optimization_lpc.c | 121 +++++++++++++++
celt/tests/test_unit_rotation.c | 1 +
celt_headers.mk | 1 +
celt_sources.mk | 1 +
13 files changed, 482 insertions(+), 5 deletions(-)
create mode 100644 celt/arm/celt_lpc_arm.h
create mode 100644 celt/arm/celt_lpc_neon_intr.c
create mode 100644 celt/tests/test_unit_optimization_lpc.c
diff --git a/.gitignore b/.gitignore
index 33127c9..6d3b48b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,6 +20,7 @@ install-sh
.deps
.libs
.dirstamp
+.project
*.a
*.exe
*.la
@@ -57,6 +58,7 @@ celt/tests/test_unit_entropy
celt/tests/test_unit_laplace
celt/tests/test_unit_mathops
celt/tests/test_unit_mdct
+celt/tests/test_unit_optimization_lpc
celt/tests/test_unit_rotation
celt/tests/test_unit_types
doc/doxygen_sqlite3.db
diff --git a/Makefile.am b/Makefile.am
index cfdaced..e06d687 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -83,9 +83,9 @@ pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_type
noinst_HEADERS = $(OPUS_HEAD) $(SILK_HEAD) $(CELT_HEAD)
if EXTRA_PROGRAMS
-noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct celt/tests/test_unit_rotation celt/tests/test_unit_types
+noinst_PROGRAMS = opus_demo repacketizer_demo opus_compare tests/test_opus_api tests/test_opus_encode tests/test_opus_decode tests/test_opus_padding celt/tests/test_unit_cwrs32 celt/tests/test_unit_dft celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_mathops celt/tests/test_unit_mdct celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_types
-TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_rotation celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding
+TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_unit_entropy celt/tests/test_unit_laplace celt/tests/test_unit_dft celt/tests/test_unit_mdct celt/tests/test_unit_optimization_lpc celt/tests/test_unit_rotation celt/tests/test_unit_cwrs32 tests/test_opus_api tests/test_opus_decode tests/test_opus_encode tests/test_opus_padding
opus_demo_SOURCES = src/opus_demo.c
@@ -137,6 +137,12 @@ if OPUS_ARM_EXTERNAL_ASM
celt_tests_test_unit_mdct_LDADD += libarmasm.la
endif
+celt_tests_test_unit_optimization_lpc_SOURCES = celt/tests/test_unit_optimization_lpc.c
+celt_tests_test_unit_optimization_lpc_LDADD = $(NE10_LIBS) $(LIBM)
+if OPUS_ARM_EXTERNAL_ASM
+celt_tests_test_unit_optimization_lpc_LDADD += libarmasm.la
+endif
+
celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
celt_tests_test_unit_rotation_LDADD = $(NE10_LIBS) $(LIBM)
if OPUS_ARM_EXTERNAL_ASM
@@ -272,10 +278,11 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl
%-gnu.S: %.s
$(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@
-OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
- $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \
+OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_dft_SOURCES:.c=.o) \
+ $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \
$(celt_tests_test_unit_mdct_SOURCES:.c=.o) \
- $(celt_tests_test_unit_dft_SOURCES:.c=.o)
+ $(celt_tests_test_unit_optimization_lpc_SOURCES:.c=.o) \
+ $(celt_tests_test_unit_rotation_SOURCES:.c=.o)
if HAVE_SSE
SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo)
diff --git a/celt/arm/arm_celt_map.c b/celt/arm/arm_celt_map.c
index ee6c244..4e90fde 100644
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@@ -29,6 +29,7 @@
#include "config.h"
#endif
+#include "celt_lpc.h"
#include "pitch.h"
#include "kiss_fft.h"
#include "mdct.h"
@@ -36,6 +37,22 @@
#if defined(OPUS_HAVE_RTCD)
# if defined(FIXED_POINT)
+void celt_fir_neon( /* NEON implementation; its definition lives in celt/arm/celt_lpc_neon_intr.c */
+ const opus_val16 *_x,
+ const opus_val16 *num,
+ opus_val16 *_y,
+ int N,
+ int ord,
+ int arch);
+/* Dispatch table for celt_fir(): one entry per arch index; the first three slots always use the C version. */
+void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
+ const opus_val16 *, opus_val16 *, int, int, int) = {
+ celt_fir_c, /* ARMv4 */
+ celt_fir_c, /* EDSP */
+ celt_fir_c, /* Media */
+ MAY_HAVE_NEON(celt_fir) /* NEON */
+};
+
opus_val32 (*const CELT_PITCH_XCORR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *,
const opus_val16 *, opus_val32 *, int , int) = {
celt_pitch_xcorr_c, /* ARMv4 */
diff --git a/celt/arm/celt_lpc_arm.h b/celt/arm/celt_lpc_arm.h
new file mode 100644
index 0000000..101df3d
--- /dev/null
+++ b/celt/arm/celt_lpc_arm.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#if !defined(CELT_LPC_ARM_H)
+# define CELT_LPC_ARM_H
+
+# include "armcpu.h"
+
+# if defined(FIXED_POINT)
+
+# if defined(OPUS_ARM_MAY_HAVE_NEON) /* declare the NEON FIR only when NEON may be available */
+void celt_fir_neon( /* same argument list as the entries of CELT_FIR_IMPL */
+ const opus_val16 *_x,
+ const opus_val16 *num,
+ opus_val16 *_y,
+ int N,
+ int ord,
+ int arch);
+# endif
+
+# if !defined(OPUS_HAVE_RTCD) /* no run-time CPU detection: bind celt_fir() at compile time */
+# define OVERRIDE_CELT_FIR (1) /* tells later code that celt_fir is already defined */
+# define celt_fir(x, num, y, N, ord, arch) \
+ (PRESUME_NEON(celt_fir)(x, num, y, N, ord, arch))
+# endif /* arch is forwarded to the callee, so it is evaluated exactly once */
+
+#if !defined(OVERRIDE_CELT_FIR) /* skip when celt_fir was already bound at compile time */
+/*Is run-time CPU detection enabled on this platform?*/
+# if defined(OPUS_HAVE_RTCD) && (defined(OPUS_ARM_ASM) \
+ || (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) \
+ && !defined(OPUS_ARM_PRESUME_NEON_INTR)))
+extern void (*const CELT_FIR_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *, /* one implementation per arch index */
+ const opus_val16 *, opus_val16 *, int, int, int);
+/* Route celt_fir() through the table entry selected by (arch & OPUS_ARCHMASK). */
+# define OVERRIDE_CELT_FIR
+# define celt_fir(x, num, y, N, ord, arch) \
+ ((*CELT_FIR_IMPL[(arch)&OPUS_ARCHMASK])(x, num, y, N, ord, arch))
+# endif
+#endif
+#endif /* end FIXED_POINT */
+
+#endif /* end CELT_LPC_ARM_H */
diff --git a/celt/arm/celt_lpc_neon_intr.c b/celt/arm/celt_lpc_neon_intr.c
new file mode 100644
index 0000000..4715d0b
--- /dev/null
+++ b/celt/arm/celt_lpc_neon_intr.c
@@ -0,0 +1,254 @@
+/* Copyright (c) 2016 Google Inc. */
+/**
+ @file celt_lpc_neon_intr.c
+ @brief ARM Neon Intrinsic optimizations for celt lpc functions
+ */
+
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <arm_neon.h>
+#include "celt_lpc.h"
+#include "stack_alloc.h"
+
+#if defined(FIXED_POINT)
+
+void celt_fir_neon( /* NEON fixed-point FIR: y[n] = _x[n] plus the SIG_SHIFT-scaled sum of num[k]*_x[n-1-k] for k in [0,ord), saturated to 16 bits */
+ const opus_val16 *_x, /* input; the ord samples before _x[0] are read as history */
+ const opus_val16 *num, /* ord filter taps */
+ opus_val16 *y, /* output; exactly N samples are stored */
+ int N, /* number of output samples */
+ int ord, /* number of taps */
+ int arch) /* not used by this implementation */
+{
+ int i,j;
+ const int leftover = N & 7; /* outputs remaining after the 8-wide main loop */
+ const opus_val16 *x = _x-ord; /* start of the history window that precedes _x */
+ VARDECL(opus_val16, rnum);
+ SAVE_STACK;
+ /* Extend rnum by 3 zeros to handle the case that (ord % 4) is non-zero. */
+ ALLOC(rnum, ord+3, opus_val16);
+ for (i=0;i<ord-3;i+=4) /* rnum is num reversed; do four taps at a time */
+ vst1_s16(rnum+i, vrev64_s16(vld1_s16(num+ord-i-4)));
+ for (;i<ord;i++) /* reverse the remaining (ord % 4) taps one by one */
+ rnum[i] = num[ord-i-1];
+ rnum[ord] = rnum[ord+1] = rnum[ord+2] = 0;
+ (void)arch;
+ /* NOTE(review): the 4/8-lane loads below can read a few samples past _x[N-1] (in the tail, or when ord % 4 != 0); confirm callers' buffers allow this. */
+#ifdef SMALL_FOOTPRINT /* variant that seeds the accumulators with _x<<SIG_SHIFT and narrows once at the end */
+ for (i=0;i<N-7;i+=8) /* main loop: 8 outputs per iteration */
+ {
+ int16x8_t x_s16x8 = vld1q_s16(_x+i);
+ int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT);
+ int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT);
+ for (j=0;j<ord;j+=4) /* four taps per pass; tap j+k scales the 8-sample window at x+i+j+k */
+ {
+ const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+ x_s16x8 = vld1q_s16(x+i+j+0);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0);
+ x_s16x8 = vld1q_s16(x+i+j+1);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1);
+ x_s16x8 = vld1q_s16(x+i+j+2);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2);
+ x_s16x8 = vld1q_s16(x+i+j+3);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3);
+ }
+ vst1q_s16(y+i, vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT)));
+ }
+ if (leftover) /* tail: 1..7 outputs remain */
+ {
+ if (leftover > 4) /* 5..7 outputs: compute 8 lanes, store only the valid ones */
+ {
+ int16x8_t x_s16x8 = vld1q_s16(_x+i);
+ int32x4_t sum0_s32x4 = vshll_n_s16(vget_low_s16 (x_s16x8), SIG_SHIFT);
+ int32x4_t sum1_s32x4 = vshll_n_s16(vget_high_s16(x_s16x8), SIG_SHIFT);
+ for (j=0;j<ord;j+=4)
+ {
+ const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+ x_s16x8 = vld1q_s16(x+i+j+0);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0);
+ x_s16x8 = vld1q_s16(x+i+j+1);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1);
+ x_s16x8 = vld1q_s16(x+i+j+2);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2);
+ x_s16x8 = vld1q_s16(x+i+j+3);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3);
+ }
+ const int16x8_t y_s16x8 = vcombine_s16(vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT), vqrshrn_n_s32(sum1_s32x4, SIG_SHIFT));
+ vst1_s16(y+i, vget_low_s16(y_s16x8)); /* first four outputs are always valid here */
+ vst1q_lane_s16(y+i+4, y_s16x8, 4);
+ if (leftover >= 6)
+ {
+ vst1q_lane_s16(y+i+5, y_s16x8, 5);
+ if (leftover == 7)
+ {
+ vst1q_lane_s16(y+i+6, y_s16x8, 6);
+ }
+ }
+ }
+ else { /* 1..4 outputs: compute 4 lanes, store only the valid ones */
+ int32x4_t sum0_s32x4 = vshll_n_s16(vld1_s16(_x+i), SIG_SHIFT);
+ for (j=0;j<ord;j+=4)
+ {
+ const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0), rnum_s16x4, 0);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1), rnum_s16x4, 1);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2), rnum_s16x4, 2);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3), rnum_s16x4, 3);
+ }
+ const int16x4_t y_s16x4 = vqrshrn_n_s32(sum0_s32x4, SIG_SHIFT);
+ if (leftover == 4)
+ {
+ vst1_s16(y+i, y_s16x4);
+ }
+ else
+ {
+ vst1_lane_s16(y+i, y_s16x4, 0);
+ if (leftover >= 2)
+ {
+ vst1_lane_s16(y+i+1, y_s16x4, 1);
+ if (leftover == 3)
+ {
+ vst1_lane_s16(y+i+2, y_s16x4, 2);
+ }
+ }
+ }
+ }
+ }
+#else /* default variant: accumulate the taps, round-shift by SIG_SHIFT, add _x, then saturate to 16 bits */
+ for (i=0;i<N-7;i+=8) /* main loop: 8 outputs per iteration */
+ {
+ int32x4_t sum0_s32x4, sum1_s32x4;
+ sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0);
+ for (j=0;j<ord;j+=4) /* four taps per pass; tap j+k scales the 8-sample window at x+i+j+k */
+ {
+ const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+ int16x8_t x_s16x8 = vld1q_s16(x+i+j+0);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0);
+ x_s16x8 = vld1q_s16(x+i+j+1);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1);
+ x_s16x8 = vld1q_s16(x+i+j+2);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2);
+ x_s16x8 = vld1q_s16(x+i+j+3);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3);
+ }
+ sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+ sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT);
+ const int16x8_t x_s16x8 = vld1q_s16(_x+i);
+ sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8));
+ sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8));
+ vst1q_s16(y+i, vcombine_s16(vqmovn_s32(sum0_s32x4), vqmovn_s32(sum1_s32x4)));
+ }
+ if (leftover) /* tail: 1..7 outputs remain */
+ {
+ if (leftover > 4) /* 5..7 outputs: compute 8 lanes, store only the valid ones */
+ {
+ int32x4_t sum0_s32x4, sum1_s32x4;
+ sum0_s32x4 = sum1_s32x4 = vdupq_n_s32(0);
+ for (j=0;j<ord;j+=4)
+ {
+ const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+ int16x8_t x_s16x8 = vld1q_s16(x+i+j+0);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 0);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 0);
+ x_s16x8 = vld1q_s16(x+i+j+1);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 1);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 1);
+ x_s16x8 = vld1q_s16(x+i+j+2);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 2);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 2);
+ x_s16x8 = vld1q_s16(x+i+j+3);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vget_low_s16 (x_s16x8), rnum_s16x4, 3);
+ sum1_s32x4 = vmlal_lane_s16(sum1_s32x4, vget_high_s16(x_s16x8), rnum_s16x4, 3);
+ }
+ sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+ sum1_s32x4 = vrshrq_n_s32(sum1_s32x4, SIG_SHIFT);
+ const int16x8_t x_s16x8 = vld1q_s16(_x+i);
+ sum0_s32x4 = vaddw_s16(sum0_s32x4, vget_low_s16 (x_s16x8));
+ sum1_s32x4 = vaddw_s16(sum1_s32x4, vget_high_s16(x_s16x8));
+ const int16x8_t y_s16x8 = vcombine_s16(vqmovn_s32(sum0_s32x4), vqmovn_s32(sum1_s32x4));
+ vst1_s16(y+i, vget_low_s16(y_s16x8)); /* first four outputs are always valid here */
+ vst1q_lane_s16(y+i+4, y_s16x8, 4);
+ if (leftover >= 6)
+ {
+ vst1q_lane_s16(y+i+5, y_s16x8, 5);
+ if (leftover == 7)
+ {
+ vst1q_lane_s16(y+i+6, y_s16x8, 6);
+ }
+ }
+ }
+ else { /* 1..4 outputs: compute 4 lanes, store only the valid ones */
+ int32x4_t sum0_s32x4 = vdupq_n_s32(0);
+ for (j=0;j<ord;j+=4)
+ {
+ const int16x4_t rnum_s16x4 = vld1_s16(rnum+j);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+0), rnum_s16x4, 0);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+1), rnum_s16x4, 1);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+2), rnum_s16x4, 2);
+ sum0_s32x4 = vmlal_lane_s16(sum0_s32x4, vld1_s16(x+i+j+3), rnum_s16x4, 3);
+ }
+ sum0_s32x4 = vrshrq_n_s32(sum0_s32x4, SIG_SHIFT);
+ sum0_s32x4 = vaddw_s16(sum0_s32x4, vld1_s16(_x+i));
+ const int16x4_t y_s16x4 = vqmovn_s32(sum0_s32x4);
+ if (leftover == 4)
+ {
+ vst1_s16(y+i, y_s16x4);
+ }
+ else
+ {
+ vst1_lane_s16(y+i, y_s16x4, 0);
+ if (leftover >= 2)
+ {
+ vst1_lane_s16(y+i+1, y_s16x4, 1);
+ if (leftover == 3)
+ {
+ vst1_lane_s16(y+i+2, y_s16x4, 2);
+ }
+ }
+ }
+ }
+ }
+#endif
+ RESTORE_STACK;
+}
+
+#endif
diff --git a/celt/celt_lpc.h b/celt/celt_lpc.h
index a4c5fd6..76a73c0 100644
--- a/celt/celt_lpc.h
+++ b/celt/celt_lpc.h
@@ -35,6 +35,11 @@
#include "x86/celt_lpc_sse.h"
#endif
+#if ((defined(OPUS_ARM_ASM) && defined(FIXED_POINT)) \
+ || defined(OPUS_ARM_MAY_HAVE_NEON_INTR))
+#include "arm/celt_lpc_arm.h"
+#endif
+
#define LPC_ORDER 24
void _celt_lpc(opus_val16 *_lpc, const opus_val32 *ac, int p);
diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
index 6166eb0..582618e 100644
--- a/celt/tests/test_unit_dft.c
+++ b/celt/tests/test_unit_dft.c
@@ -52,6 +52,7 @@
# include "celt_lpc.c"
# include "pitch.c"
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "mdct.c"
diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c
index fd3319d..da92f16 100644
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@@ -66,6 +66,7 @@
#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/armcpu.c"
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "kiss_fft.c"
diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
index 8dbb9ca..0658c7a 100644
--- a/celt/tests/test_unit_mdct.c
+++ b/celt/tests/test_unit_mdct.c
@@ -53,6 +53,7 @@
# include "pitch.c"
# include "celt_lpc.c"
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "arm/celt_ne10_fft.c"
diff --git a/celt/tests/test_unit_optimization_lpc.c b/celt/tests/test_unit_optimization_lpc.c
new file mode 100644
index 0000000..146526e
--- /dev/null
+++ b/celt/tests/test_unit_optimization_lpc.c
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 Google Inc. */
+/*
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
+ OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#define SKIP_CONFIG_H
+
+#ifndef CUSTOM_MODES
+#define CUSTOM_MODES
+#endif
+
+#include <stdio.h>
+
+#define CELT_C
+#include "stack_alloc.h"
+#include "mathops.c"
+#include "entcode.c"
+
+#ifdef FIXED_POINT
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
+# include "celt_lpc.c"
+# include "pitch.c"
+# include "x86/x86cpu.c"
+# include "x86/celt_lpc_sse.c"
+# include "x86/pitch_sse2.c"
+# include "x86/pitch_sse4_1.c"
+# include "x86/x86_celt_map.c"
+#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "celt_lpc.c"
+# include "pitch.c"
+# include "arm/armcpu.c"
+# include "arm/arm_celt_map.c"
+# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/celt_lpc_neon_intr.c"
+# endif
+#endif
+
+#define MAX_ORDER 32
+
+void test_fir(int arch) /* Compare celt_fir() against the reference celt_fir_c() for every ord in [0,MAX_ORDER] and N in [ord,MAX_PERIOD]; exits non-zero on mismatch. */
+{
+ opus_val16 x[MAX_PERIOD+MAX_ORDER];
+ opus_val16 num[MAX_ORDER];
+ opus_val16 yorg[MAX_PERIOD], yopt[MAX_PERIOD];
+ int N, ord;
+ /* x holds MAX_ORDER history samples followed by the input; yorg/yopt receive the reference and optimized outputs. */
+ unsigned int i;
+ for(ord=0;ord<=MAX_ORDER;ord++)
+ {
+ printf("ord=%2d", ord);
+ for(N=ord;N<=MAX_PERIOD;N++) /* N is larger than or equal to ord. */
+ {
+ for (i=0;i<MAX_PERIOD+MAX_ORDER;++i)
+ {
+ x[i] = (rand() % 32767) - 16384;
+ }
+ for (i=0;i<MAX_PERIOD;++i)
+ {
+ yorg[i] = (rand() % 32767) - 16384;
+ }
+ for (i=0;i<MAX_ORDER;++i)
+ {
+ num[i] = (rand() % 32767) - 16384;
+ }
+ memcpy(yopt, yorg, sizeof(yorg));
+ /* Both outputs start from identical random contents, so the whole-buffer compare also catches writes beyond N samples. */
+ celt_fir_c(x+MAX_ORDER, num, yorg, N, ord, arch);
+ celt_fir (x+MAX_ORDER, num, yopt, N, ord, arch);
+ if (memcmp(yorg, yopt, sizeof(yorg)))
+ {
+ printf(" N=%3d failed!\nError in lpc unit test!!!\n", N);
+ for (i=0;i<sizeof(yorg) / sizeof(*yorg);i++)
+ {
+ if (yorg[i] != yopt[i])
+ {
+ printf("yorg[%3u]=%d, yopt[%3u]=%d\n", i, yorg[i], i, yopt[i]); /* i is unsigned, hence %u */
+ }
+ }
+ exit(1); /* non-zero status so the test harness records the failure */
+ }
+ }
+ printf(" passed!\n");
+ }
+}
+#endif /* FIXED_POINT */
+
+int main(void)
+{
+#if defined(FIXED_POINT)
+ /* The FIR comparison is only built for fixed-point; other builds just return 0. */
+ ALLOC_STACK;
+ test_fir(opus_select_arch());
+#endif /* FIXED_POINT */
+ return 0;
+}
diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c
index 1080c20..3a85a29 100644
--- a/celt/tests/test_unit_rotation.c
+++ b/celt/tests/test_unit_rotation.c
@@ -64,6 +64,7 @@
#elif defined(OPUS_ARM_ASM) || defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
# include "arm/armcpu.c"
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
+# include "arm/celt_lpc_neon_intr.c"
# include "arm/celt_neon_intr.c"
# if defined(HAVE_ARM_NE10)
# include "kiss_fft.c"
diff --git a/celt_headers.mk b/celt_headers.mk
index 0eca6e6..4067e5a 100644
--- a/celt_headers.mk
+++ b/celt_headers.mk
@@ -34,6 +34,7 @@ celt/static_modes_fixed.h \
celt/static_modes_float_arm_ne10.h \
celt/static_modes_fixed_arm_ne10.h \
celt/arm/armcpu.h \
+celt/arm/celt_lpc_arm.h \
celt/arm/fixed_armv4.h \
celt/arm/fixed_armv5e.h \
celt/arm/kiss_fft_armv4.h \
diff --git a/celt_sources.mk b/celt_sources.mk
index 2ffe99a..37c0129 100644
--- a/celt_sources.mk
+++ b/celt_sources.mk
@@ -37,6 +37,7 @@ CELT_AM_SOURCES_ARM_ASM = \
celt/arm/armopts.s.in
CELT_SOURCES_ARM_NEON_INTR = \
+celt/arm/celt_lpc_neon_intr.c \
celt/arm/celt_neon_intr.c
CELT_SOURCES_ARM_NE10= \
--
2.8.0.rc3.226.g39d4020
More information about the opus
mailing list