[opus] [RFC PATCH v1 2/2] armv7(float): Optimize encode usecase using NE10 library

Viswanath Puttagunta viswanath.puttagunta at linaro.org
Fri Jan 23 06:53:59 PST 2015


On 20 January 2015 at 11:37, Viswanath Puttagunta
<viswanath.puttagunta at linaro.org> wrote:
>
> Optimize the opus encode (float only) use case using the ARM NE10
> library. Mainly affects opus_fft and clt_mdct_forward
> and related functions.
>
> This optimization can be used for ARM CPUs that have a NEON
> VFP unit. This patch only enables optimizations for ARMv7.
>
> Official ARM NE10 library page available at
> http://projectne10.github.io/Ne10/
>
> To enable this optimization, use
> --enable-intrinsics --with-NE10=<install_prefix>
> or
> --enable-intrinsics --with-NE10-libraries=<NE10_lib_dir> --with-NE10-includes=<NE10_includes_dir>
>
> Compile-time checks are made during the configure process to make sure
> the optimization option is available only when the compiler supports NEON
> intrinsics.
>
> Runtime checks are made to make sure the optimized functions are only
> called on appropriate hardware.
> ---
>  Makefile.am                          |   30 +--
>  celt/arm/arm_celt_ne10_fft_map.c     |   65 ++++++
>  celt/arm/arm_celt_ne10_mdct_map.c    |   53 +++++
>  celt/arm/celt_ne10_fft.c             |  101 ++++++++++
>  celt/arm/celt_ne10_mdct.c            |  159 +++++++++++++++
>  celt/arm/fft_arm.h                   |   65 ++++++
>  celt/arm/mdct_arm.h                  |   52 +++++
>  celt/celt_encoder.c                  |    4 +-
>  celt/dump_modes/Makefile             |   21 +-
>  celt/dump_modes/dump_mode_arm_ne10.c |  103 ++++++++++
>  celt/dump_modes/dump_modes.c         |   22 +-
>  celt/dump_modes/dump_modes_arch.h    |   14 ++
>  celt/kiss_fft.c                      |   18 +-
>  celt/kiss_fft.h                      |   44 +++-
>  celt/mdct.c                          |    2 +-
>  celt/mdct.h                          |   29 ++-
>  celt/static_modes_float.h            |   25 +++
>  celt/static_modes_float_arm_ne10.h   |  367 ++++++++++++++++++++++++++++++++++
>  celt/tests/test_unit_dft.c           |   14 +-
>  celt/tests/test_unit_mdct.c          |   19 +-
>  celt_headers.mk                      |    3 +
>  celt_sources.mk                      |    6 +
>  configure.ac                         |   81 ++++++++
>  src/analysis.c                       |    2 +-
>  src/opus_multistream_encoder.c       |    3 +-
>  25 files changed, 1278 insertions(+), 24 deletions(-)
>  create mode 100644 celt/arm/arm_celt_ne10_fft_map.c
>  create mode 100644 celt/arm/arm_celt_ne10_mdct_map.c
>  create mode 100644 celt/arm/celt_ne10_fft.c
>  create mode 100644 celt/arm/celt_ne10_mdct.c
>  create mode 100644 celt/arm/fft_arm.h
>  create mode 100644 celt/arm/mdct_arm.h
>  create mode 100644 celt/dump_modes/dump_mode_arm_ne10.c
>  create mode 100644 celt/dump_modes/dump_modes_arch.h
>  create mode 100644 celt/static_modes_float_arm_ne10.h
>
> diff --git a/Makefile.am b/Makefile.am
> index 95323ca..5ad93aa 100644
> --- a/Makefile.am
> +++ b/Makefile.am
> @@ -10,7 +10,7 @@ lib_LTLIBRARIES = libopus.la
>  DIST_SUBDIRS = doc
>
>  AM_CPPFLAGS = -I$(top_srcdir)/include -I$(top_srcdir)/celt -I$(top_srcdir)/silk \
> -              -I$(top_srcdir)/silk/float -I$(top_srcdir)/silk/fixed
> +              -I$(top_srcdir)/silk/float -I$(top_srcdir)/silk/fixed $(NE10_CFLAGS)
>
>  include celt_sources.mk
>  include silk_sources.mk
> @@ -47,6 +47,10 @@ CELT_SOURCES += $(CELT_SOURCES_ARM_NEON_INTR)
>  OPUS_ARM_NEON_INTR_CPPFLAGS = -mfpu=neon
>  endif
>
> +if HAVE_ARM_NE10
> +CELT_SOURCES += $(CELT_SOURCES_ARM_NE10)
> +endif
> +
>  if OPUS_ARM_EXTERNAL_ASM
>  nodist_libopus_la_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S)
>  BUILT_SOURCES = $(CELT_SOURCES_ARM_ASM:.s=-gnu.S) \
> @@ -64,7 +68,7 @@ include opus_headers.mk
>
>  libopus_la_SOURCES = $(CELT_SOURCES) $(SILK_SOURCES) $(OPUS_SOURCES)
>  libopus_la_LDFLAGS = -no-undefined -version-info @OPUS_LT_CURRENT@:@OPUS_LT_REVISION@:@OPUS_LT_AGE@
> -libopus_la_LIBADD = $(LIBM)
> +libopus_la_LIBADD = $(NE10_LIBS) $(LIBM)
>
>  pkginclude_HEADERS = include/opus.h include/opus_multistream.h include/opus_types.h include/opus_defines.h
>
> @@ -77,32 +81,32 @@ TESTS = celt/tests/test_unit_types celt/tests/test_unit_mathops celt/tests/test_
>
>  opus_demo_SOURCES = src/opus_demo.c
>
> -opus_demo_LDADD = libopus.la $(LIBM)
> +opus_demo_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
>
>  repacketizer_demo_SOURCES = src/repacketizer_demo.c
>
> -repacketizer_demo_LDADD = libopus.la $(LIBM)
> +repacketizer_demo_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
>
>  opus_compare_SOURCES = src/opus_compare.c
>  opus_compare_LDADD = $(LIBM)
>
>  tests_test_opus_api_SOURCES = tests/test_opus_api.c tests/test_opus_common.h
> -tests_test_opus_api_LDADD = libopus.la $(LIBM)
> +tests_test_opus_api_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
>
>  tests_test_opus_encode_SOURCES = tests/test_opus_encode.c tests/test_opus_common.h
> -tests_test_opus_encode_LDADD = libopus.la $(LIBM)
> +tests_test_opus_encode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
>
>  tests_test_opus_decode_SOURCES = tests/test_opus_decode.c tests/test_opus_common.h
> -tests_test_opus_decode_LDADD = libopus.la $(LIBM)
> +tests_test_opus_decode_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
>
>  tests_test_opus_padding_SOURCES = tests/test_opus_padding.c tests/test_opus_common.h
> -tests_test_opus_padding_LDADD = libopus.la $(LIBM)
> +tests_test_opus_padding_LDADD = libopus.la $(NE10_LIBS) $(LIBM)
>
>  celt_tests_test_unit_cwrs32_SOURCES = celt/tests/test_unit_cwrs32.c
>  celt_tests_test_unit_cwrs32_LDADD = $(LIBM)
>
>  celt_tests_test_unit_dft_SOURCES = celt/tests/test_unit_dft.c
> -celt_tests_test_unit_dft_LDADD = $(LIBM)
> +celt_tests_test_unit_dft_LDADD = $(NE10_LIBS) $(LIBM)
>
>  celt_tests_test_unit_entropy_SOURCES = celt/tests/test_unit_entropy.c
>  celt_tests_test_unit_entropy_LDADD = $(LIBM)
> @@ -119,7 +123,7 @@ endif
>  endif
>
>  celt_tests_test_unit_mdct_SOURCES = celt/tests/test_unit_mdct.c
> -celt_tests_test_unit_mdct_LDADD = $(LIBM)
> +celt_tests_test_unit_mdct_LDADD = $(NE10_LIBS) $(LIBM)
>
>  celt_tests_test_unit_rotation_SOURCES = celt/tests/test_unit_rotation.c
>  celt_tests_test_unit_rotation_LDADD = $(LIBM)
> @@ -269,6 +273,8 @@ endif
>
>  if OPUS_ARM_NEON_INTR
>  CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \
> -                       %test_unit_rotation.o %test_unit_mathops.o
> -$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS)
> +                        $(CELT_SOURCES_ARM_NE10:.c=.lo) \
> +                                      %test_unit_rotation.o %test_unit_mathops.o \
> +                        %test_unit_mdct.o %test_unit_dft.o
> +$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CPPFLAGS) $(NE10_CFLAGS)
>  endif
> diff --git a/celt/arm/arm_celt_ne10_fft_map.c b/celt/arm/arm_celt_ne10_fft_map.c
> new file mode 100644
> index 0000000..5bb7b5f
> --- /dev/null
> +++ b/celt/arm/arm_celt_ne10_fft_map.c
> @@ -0,0 +1,65 @@
> +/* Copyright (c) 2015-2016 Xiph.Org Foundation
> +   Written by Viswanath Puttagunta */
> +/**
> +   @file arm_celt_ne10_fft_map.c
> +   @brief Map for ARM Neon optimizations for fft using NE10
> + */
> +
> +/*
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#ifdef HAVE_CONFIG_H
> +#include "config.h"
> +#endif
> +
> +#include "kiss_fft.h"
> +
> +#if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10)
> +#ifdef CUSTOM_MODES
> +int (*const OPUS_FFT_ALLOC_ARCH[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
> +   opus_fft_alloc_arch_c,           /* ARMv4 */
> +   opus_fft_alloc_arch_c,           /* EDSP */
> +   opus_fft_alloc_arch_c,           /* Media */
> +   opus_fft_alloc_arm_float_neon    /* Neon with NE10 library support */
> +};
> +
> +void (*const OPUS_FFT_FREE_ARCH[OPUS_ARCHMASK+1])(kiss_fft_state *st) = {
> +   opus_fft_free_arch_c,              /* ARMv4 */
> +   opus_fft_free_arch_c,              /* EDSP */
> +   opus_fft_free_arch_c,              /* Media */
> +   opus_fft_free_arm_float_neon       /* Neon with NE10 */
> +};
> +#endif /* CUSTOM_MODES */
> +
> +void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
> +                                       const kiss_fft_cpx *fin,
> +                                       kiss_fft_cpx *fout) = {
> +   opus_fft_c,                   /* ARMv4 */
> +   opus_fft_c,                   /* EDSP */
> +   opus_fft_c,                   /* Media */
> +   opus_fft_float_neon           /* Neon with NE10 */
> +};
> +
> +#endif
> diff --git a/celt/arm/arm_celt_ne10_mdct_map.c b/celt/arm/arm_celt_ne10_mdct_map.c
> new file mode 100644
> index 0000000..6df7af3
> --- /dev/null
> +++ b/celt/arm/arm_celt_ne10_mdct_map.c
> @@ -0,0 +1,53 @@
> +/* Copyright (c) 2015-2016 Xiph.Org Foundation
> +   Written by Viswanath Puttagunta */
> +/**
> +   @file arm_celt_ne10_mdct_map.c
> +   @brief Map for ARM Neon optimizations for mdct using NE10
> + */
> +
> +/*
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#ifdef HAVE_CONFIG_H
> +#include "config.h"
> +#endif
> +
> +#include "kiss_fft.h"
> +#include "mdct.h"
> +
> +#if defined(OPUS_HAVE_RTCD) && defined(HAVE_ARM_NE10)
> +
> +void (*const CLT_MDCT_FORWARD_ARCH[OPUS_ARCHMASK+1])(const mdct_lookup *l,
> +                                      kiss_fft_scalar *in,
> +                                      kiss_fft_scalar * OPUS_RESTRICT out,
> +                                      const opus_val16 *window,
> +                                      int overlap, int shift, int stride) = {
> +   clt_mdct_forward_c,           /* ARMv4 */
> +   clt_mdct_forward_c,           /* EDSP */
> +   clt_mdct_forward_c,           /* Media */
> +   clt_mdct_forward_float_neon   /* Neon with NE10 */
> +};
> +
> +#endif
> diff --git a/celt/arm/celt_ne10_fft.c b/celt/arm/celt_ne10_fft.c
> new file mode 100644
> index 0000000..fe00b25
> --- /dev/null
> +++ b/celt/arm/celt_ne10_fft.c
> @@ -0,0 +1,101 @@
> +/* Copyright (c) 2015-2016 Xiph.Org Foundation
> +   Written by Viswanath Puttagunta */
> +/**
> +   @file celt_ne10_fft.c
> +   @brief ARM Neon optimizations for fft using NE10 library
> + */
> +
> +/*
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#ifndef SKIP_CONFIG_H
> +#ifdef HAVE_CONFIG_H
> +#include "config.h"
> +#endif
> +#endif
> +
> +#include <arm_neon.h>
> +#include <NE10_init.h>
> +#include <NE10_dsp.h>
> +#include "../kiss_fft.h"
> +#include "stack_alloc.h"
> +#include "os_support.h"
> +#include "stack_alloc.h"
> +
> +#ifdef CUSTOM_MODES
> +
> +int opus_fft_alloc_arm_float_neon(kiss_fft_state *st)
> +{
> +   st->priv = (void *)ne10_fft_alloc_c2c_float32_neon(st->nfft);
> +   if (st->priv == NULL) {
> +      printf("Unable to ne10 alloc\n");
> +      return -1;
> +   }
> +   return 0;
> +}
> +
> +void opus_fft_free_arm_float_neon(kiss_fft_state *st)
> +{
> +   ne10_fft_cfg_float32_t cfg = (ne10_fft_cfg_float32_t)st->priv;
> +
> +   if (cfg)
> +      free((void *)cfg);
> +}
> +#endif
> +
> +void opus_fft_float_neon(const kiss_fft_state *st,
> +                        const kiss_fft_cpx *fin,
> +                        kiss_fft_cpx *fout)
> +{
> +   ne10_fft_cfg_float32_t cfg = (ne10_fft_cfg_float32_t)st->priv;
> +   VARDECL(ne10_fft_cpx_float32_t, temp);
> +   VARDECL(ne10_fft_cpx_float32_t, tempin);
> +   SAVE_STACK;
> +   int N2 = st->nfft >> 1;
> +   float32x4_t inq, outq;
> +   float32x2_t scale;
> +   float *in = (float *)fin;
> +   float *out;
> +   int i;
> +   ALLOC(temp, st->nfft, ne10_fft_cpx_float32_t);
> +   ALLOC(tempin, st->nfft, ne10_fft_cpx_float32_t);
> +
> +   out = (float *)tempin;
> +   scale = vld1_dup_f32(&st->scale);
> +   for (i = 0; i < N2; i++) {
> +      inq = vld1q_f32(in);
> +      in += 4;
> +      outq = vmulq_lane_f32(inq, scale, 0);
> +      vst1q_f32(out, outq);
> +      out += 4;
> +   }
> +
> +   cfg->buffer = (ne10_fft_cpx_float32_t *)&temp[0];
Speaking of thread safety: I don't think this is thread safe, because it
writes to cfg->buffer, which lives in the shared state pointed to by
st->priv. I think what will work here is to copy the state into a local
variable and set the buffer on the copy instead:

ne10_fft_state_float32_t state;
memcpy((void *)&state, st->priv, sizeof(ne10_fft_state_float32_t));
state.buffer = (ne10_fft_cpx_float32_t *)&temp[0];
ne10_fft_c2c_1d_float32_neon((ne10_fft_cpx_float32_t *)fout,
                                 (ne10_fft_cpx_float32_t *)tempin,
                                &state, 0);
I will correct this in RFCv2 after I receive a comprehensive review.
> +
> +   ne10_fft_c2c_1d_float32_neon((ne10_fft_cpx_float32_t *)fout,
> +                                 (ne10_fft_cpx_float32_t *)tempin,
> +                                 cfg, 0);
> +   RESTORE_STACK;
> +}
> diff --git a/celt/arm/celt_ne10_mdct.c b/celt/arm/celt_ne10_mdct.c
> new file mode 100644
> index 0000000..177bda2
> --- /dev/null
> +++ b/celt/arm/celt_ne10_mdct.c
> @@ -0,0 +1,159 @@
> +/* Copyright (c) 2015-2016 Xiph.Org Foundation
> +   Written by Viswanath Puttagunta */
> +/**
> +   @file celt_ne10_mdct.c
> +   @brief ARM Neon optimizations for mdct using NE10 library
> + */
> +
> +/*
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#ifndef SKIP_CONFIG_H
> +#ifdef HAVE_CONFIG_H
> +#include "config.h"
> +#endif
> +#endif
> +
> +#include "../kiss_fft.h"
> +#include "_kiss_fft_guts.h"
> +#include "../mdct.h"
> +#include "stack_alloc.h"
> +#include "os_support.h"
> +#include "stack_alloc.h"
> +
> +void clt_mdct_forward_float_neon(const mdct_lookup *l,
> +                                 kiss_fft_scalar *in,
> +                                 kiss_fft_scalar * OPUS_RESTRICT out,
> +                                 const opus_val16 *window,
> +                                 int overlap, int shift, int stride)
> +{
> +   int i;
> +   int N, N2, N4;
> +   VARDECL(kiss_fft_scalar, f);
> +   VARDECL(kiss_fft_cpx, f2);
> +   const kiss_fft_state *st = l->kfft[shift];
> +   const kiss_twiddle_scalar *trig;
> +
> +   SAVE_STACK;
> +
> +   N = l->n;
> +   trig = l->trig;
> +   for (i=0;i<shift;i++)
> +   {
> +      N >>= 1;
> +      trig += N;
> +   }
> +   N2 = N>>1;
> +   N4 = N>>2;
> +
> +   ALLOC(f, N2, kiss_fft_scalar);
> +   ALLOC(f2, N4, kiss_fft_cpx);
> +
> +   /* Consider the input to be composed of four blocks: [a, b, c, d] */
> +   /* Window, shuffle, fold */
> +   {
> +      /* Temp pointers to make it really clear to the compiler what we're doing */
> +      const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
> +      const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
> +      kiss_fft_scalar * OPUS_RESTRICT yp = f;
> +      const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
> +      const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
> +      for(i=0;i<((overlap+3)>>2);i++)
> +      {
> +         /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
> +         *yp++ = MULT16_32_Q15(*wp2, xp1[N2]) + MULT16_32_Q15(*wp1,*xp2);
> +         *yp++ = MULT16_32_Q15(*wp1, *xp1)    - MULT16_32_Q15(*wp2, xp2[-N2]);
> +         xp1+=2;
> +         xp2-=2;
> +         wp1+=2;
> +         wp2-=2;
> +      }
> +      wp1 = window;
> +      wp2 = window+overlap-1;
> +      for(;i<N4-((overlap+3)>>2);i++)
> +      {
> +         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
> +         *yp++ = *xp2;
> +         *yp++ = *xp1;
> +         xp1+=2;
> +         xp2-=2;
> +      }
> +      for(;i<N4;i++)
> +      {
> +         /* Real part arranged as a-bR, Imag part arranged as -c-dR */
> +         *yp++ =  -MULT16_32_Q15(*wp1, xp1[-N2]) + MULT16_32_Q15(*wp2, *xp2);
> +         *yp++ = MULT16_32_Q15(*wp2, *xp1)     + MULT16_32_Q15(*wp1, xp2[N2]);
> +         xp1+=2;
> +         xp2-=2;
> +         wp1+=2;
> +         wp2-=2;
> +      }
> +   }
> +   /* Pre-rotation */
> +   {
> +      kiss_fft_scalar * OPUS_RESTRICT yp = f;
> +      const kiss_twiddle_scalar *t = &trig[0];
> +      for(i=0;i<N4;i++)
> +      {
> +         kiss_fft_cpx yc;
> +         kiss_twiddle_scalar t0, t1;
> +         kiss_fft_scalar re, im, yr, yi;
> +         t0 = t[i];
> +         t1 = t[N4+i];
> +         re = *yp++;
> +         im = *yp++;
> +         yr = S_MUL(re,t0)  -  S_MUL(im,t1);
> +         yi = S_MUL(im,t0)  +  S_MUL(re,t1);
> +         yc.r = yr;
> +         yc.i = yi;
> +         f2[i] = yc;
> +      }
> +   }
> +
> +   /* N/4 complex FFT, does not downscale anymore */
> +   opus_fft(st, f2, (kiss_fft_cpx *)f, opus_select_arch());
> +
> +   /* Post-rotate */
> +   {
> +      /* Temp pointers to make it really clear to the compiler what we're doing */
> +      const kiss_fft_cpx * OPUS_RESTRICT fp = (kiss_fft_cpx *)f;
> +      kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
> +      kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
> +      const kiss_twiddle_scalar *t = &trig[0];
> +      /* Temp pointers to make it really clear to the compiler what we're doing */
> +      for(i=0;i<N4;i++)
> +      {
> +         kiss_fft_scalar yr, yi;
> +         yr = S_MUL(fp->i,t[N4+i]) - S_MUL(fp->r,t[i]);
> +         yi = S_MUL(fp->r,t[N4+i]) + S_MUL(fp->i,t[i]);
> +         *yp1 = yr;
> +         *yp2 = yi;
> +         fp++;
> +         yp1 += 2*stride;
> +         yp2 -= 2*stride;
> +      }
> +   }
> +   RESTORE_STACK;
> +}
> diff --git a/celt/arm/fft_arm.h b/celt/arm/fft_arm.h
> new file mode 100644
> index 0000000..16f008b
> --- /dev/null
> +++ b/celt/arm/fft_arm.h
> @@ -0,0 +1,65 @@
> +/* Copyright (c) 2015-2016 Xiph.Org Foundation
> +   Written by Viswanath Puttagunta */
> +/**
> +   @file fft_arm.h
> +   @brief ARM Neon Intrinsic optimizations for fft using NE10 library
> + */
> +
> +/*
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +
> +#if !defined(FFT_ARM_H)
> +#define FFT_ARM_H
> +
> +#include "config.h"
> +#include "kiss_fft.h"
> +
> +#if !defined(FIXED_POINT)
> +#if defined(HAVE_ARM_NE10)
> +
> +int opus_fft_alloc_arm_float_neon(kiss_fft_state *st);
> +void opus_fft_free_arm_float_neon(kiss_fft_state *st);
> +
> +void opus_fft_float_neon(const kiss_fft_state *st,
> +                        const kiss_fft_cpx *fin,
> +                        kiss_fft_cpx *fout);
> +#if !defined(OPUS_HAVE_RTCD)
> +#define OVERRIDE_OPUS_FFT (1)
> +
> +#define opus_fft_alloc_arch(_st, arch) \
> +   opus_fft_alloc_arm_float_neon(_st)
> +
> +#define opus_fft_free_arch(_st, arch) opus_fft_free_arm_float_neon(_st)
> +
> +#define opus_fft(_st, _fin, _fout, arch) \
> +   opus_fft_float_neon(_st, _fin, _fout)
> +
> +#endif /* OPUS_HAVE_RTCD */
> +
> +#endif /* HAVE_ARM_NE10 */
> +#endif /* FIXED_POINT */
> +
> +#endif
> diff --git a/celt/arm/mdct_arm.h b/celt/arm/mdct_arm.h
> new file mode 100644
> index 0000000..d0a8a8c
> --- /dev/null
> +++ b/celt/arm/mdct_arm.h
> @@ -0,0 +1,52 @@
> +/* Copyright (c) 2015-2016 Xiph.Org Foundation
> +   Written by Viswanath Puttagunta */
> +/**
> +   @file mdct_arm.h
> +   @brief ARM Neon Intrinsic optimizations for mdct using NE10 library
> + */
> +
> +/*
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#if !defined(MDCT_ARM_H)
> +#define MDCT_ARM_H
> +
> +#include "config.h"
> +#include "mdct.h"
> +
> +#if !defined(FIXED_POINT) && defined(HAVE_ARM_NE10)
> +/** Compute a forward MDCT and scale by 4/N, trashes the input array */
> +void clt_mdct_forward_float_neon(const mdct_lookup *l, kiss_fft_scalar *in,
> +      kiss_fft_scalar * OPUS_RESTRICT out,
> +      const opus_val16 *window, int overlap, int shift, int stride);
> +
> +#if !defined(OPUS_HAVE_RTCD)
> +#define OVERRIDE_OPUS_MDCT (1)
> +#define clt_mdct_forward(_l, _in, _out, _window, _int, _shift, _stride, _arch) \
> +      clt_mdct_forward_float_neon((_l, _in, _out, _window, _int, _shift, _stride)
> +#endif /* OPUS_HAVE_RTCD */
> +#endif /* !defined(FIXED_POINT) && defined(HAVE_ARM_NE10) */
> +
> +#endif
> diff --git a/celt/celt_encoder.c b/celt/celt_encoder.c
> index 86a3fbb..9de9a92 100644
> --- a/celt/celt_encoder.c
> +++ b/celt/celt_encoder.c
> @@ -435,7 +435,9 @@ static void compute_mdcts(const CELTMode *mode, int shortBlocks, celt_sig * OPUS
>        for (b=0;b<B;b++)
>        {
>           /* Interleaving the sub-frames while doing the MDCTs */
> -         clt_mdct_forward(&mode->mdct, in+c*(B*N+overlap)+b*N, &out[b+c*N*B], mode->window, overlap, shift, B);
> +         clt_mdct_forward(&mode->mdct, in+c*(B*N+overlap)+b*N,
> +                        &out[b+c*N*B], mode->window, overlap, shift, B,
> +                        opus_select_arch());
>        }
>     } while (++c<CC);
>     if (CC==2&&C==1)
> diff --git a/celt/dump_modes/Makefile b/celt/dump_modes/Makefile
> index 74d527e..8890706 100644
> --- a/celt/dump_modes/Makefile
> +++ b/celt/dump_modes/Makefile
> @@ -1,10 +1,29 @@
> +
>  CFLAGS=-O2 -Wall -Wextra -DHAVE_CONFIG_H
>  INCLUDES=-I. -I../ -I../.. -I../../include
>
> +SOURCES = dump_modes.c \
> +          ../modes.c \
> +          ../cwrs.c \
> +          ../rate.c \
> +          ../entenc.c \
> +          ../entdec.c \
> +          ../mathops.c \
> +          ../mdct.c \
> +          ../kiss_fft.c
> +
> +ifdef HAVE_ARM_NE10
> +CC = gcc
> +CFLAGS += -mfpu=neon
> +INCLUDES += -I$(NE10_INCDIR) -DHAVE_ARM_NE10 -DOPUS_ARM_NEON_INTR
> +LIBDIR = -l:$(NE10_LIBDIR)/libNE10.so
> +SOURCES += ../arm/celt_neon_intr.c dump_mode_arm_ne10.c
> +endif
> +
>  all: dump_modes
>
>  dump_modes:
> -       $(CC) $(CFLAGS) $(INCLUDES) -DCUSTOM_MODES_ONLY -DCUSTOM_MODES dump_modes.c ../modes.c ../cwrs.c ../rate.c ../entenc.c ../entdec.c ../mathops.c ../mdct.c ../kiss_fft.c -o dump_modes -lm
> +       $(PREFIX)$(CC) $(CFLAGS) $(INCLUDES) -DCUSTOM_MODES_ONLY -DCUSTOM_MODES $(SOURCES) -o $@ $(LIBDIR) -lm
>
>  clean:
>         rm -f dump_modes
> diff --git a/celt/dump_modes/dump_mode_arm_ne10.c b/celt/dump_modes/dump_mode_arm_ne10.c
> new file mode 100644
> index 0000000..30c7423
> --- /dev/null
> +++ b/celt/dump_modes/dump_mode_arm_ne10.c
> @@ -0,0 +1,103 @@
> +/* Copyright (c) 2015-2016 Xiph.Org Foundation
> +   Written by Viswanath Puttagunta */
> +/*
> +   Redistribution and use in source and binary forms, with or without
> +   modification, are permitted provided that the following conditions
> +   are met:
> +
> +   - Redistributions of source code must retain the above copyright
> +   notice, this list of conditions and the following disclaimer.
> +
> +   - Redistributions in binary form must reproduce the above copyright
> +   notice, this list of conditions and the following disclaimer in the
> +   documentation and/or other materials provided with the distribution.
> +
> +   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> +   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> +   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> +   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
> +   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
> +   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
> +   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> +   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> +   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> +   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> +   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +*/
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include "modes.h"
> +#include "dump_modes_arch.h"
> +#include <NE10_dsp.h>
> +
> +static FILE *file;
> +
> +void dump_modes_arch_init(CELTMode **modes, int nb_modes)
> +{
> +   int i;
> +
> +   file = fopen(ARM_NE10_ARCH_FILE_NAME, "w");
> +   fprintf(file, "/* The contents of this file was automatically generated by\n");
> +   fprintf(file, " * dump_mode_arm_ne10.c with arguments:");
> +   for (i=0;i<nb_modes;i++)
> +   {
> +      CELTMode *mode = modes[i];
> +      fprintf(file, " %d %d",mode->Fs,mode->shortMdctSize*mode->nbShortMdcts);
> +   }
> +   fprintf(file, "\n * It contains static definitions for some pre-defined modes. */\n");
> +   fprintf(file, "#include <NE10_init.h>\n");
> +}
> +
> +void dump_modes_arch_finalize()
> +{
> +   fclose(file);
> +}
> +
> +/* Append the NE10 factor, twiddle and cfg_arch_<nfft> tables for every FFT size of mode. */
> +void dump_mode_arch(CELTMode *mode)
> +{
> +   int k, j;
> +   int mdctSize;
> +   ne10_fft_cfg_float32_t cfg;  /* declared with the other locals, before any statement */
> +
> +   mdctSize = mode->shortMdctSize*mode->nbShortMdcts;
> +   fprintf(file, "#ifndef NE10_FFT_PARAMS%d_%d\n", mode->Fs, mdctSize);
> +   fprintf(file, "#define NE10_FFT_PARAMS%d_%d\n", mode->Fs, mdctSize);
> +   printf("Printing ne10 values\n");
> +   /* cfg->factors */
> +   for(k=0;k<=mode->mdct.maxshift;k++) {
> +      cfg = (ne10_fft_cfg_float32_t)mode->mdct.kfft[k]->priv;
> +      fprintf(file, "static const ne10_int32_t ne10_factors_%d[%d] = {\n",
> +                  mode->mdct.kfft[k]->nfft, (NE10_MAXFACTORS * 2));
> +      for(j=0;j<(NE10_MAXFACTORS * 2);j++) {
> +         fprintf(file, "%d,%c", cfg->factors[j],(j+16)%15==0?'\n':' ');
> +      }
> +      fprintf (file, "};\n");
> +   }
> +
> +   /* cfg->twiddles: 9 significant digits so the values survive the text round-trip (%f kept only 6 decimals) */
> +   for(k=0;k<=mode->mdct.maxshift;k++) {
> +      cfg = (ne10_fft_cfg_float32_t)mode->mdct.kfft[k]->priv;
> +      fprintf(file, "static const ne10_fft_cpx_float32_t ne10_twiddles_%d[%d] = {\n",
> +                  mode->mdct.kfft[k]->nfft, mode->mdct.kfft[k]->nfft);
> +      for(j=0;j<mode->mdct.kfft[k]->nfft;j++) {
> +         fprintf(file, "{%#.9gf,%#.9gf},%c", cfg->twiddles[j].r, cfg->twiddles[j].i,(j+4)%3==0?'\n':' ');
> +      }
> +      fprintf (file, "};\n");
> +   }
> +
> +   for(k=0;k<=mode->mdct.maxshift;k++) {
> +      fprintf(file, "static const ne10_fft_state_float32_t cfg_arch_%d = {\n",
> +               mode->mdct.kfft[k]->nfft);
> +      cfg = (ne10_fft_cfg_float32_t)mode->mdct.kfft[k]->priv;
> +      fprintf(file, "%d,\n", cfg->nfft);
> +      fprintf(file, "(ne10_int32_t *)ne10_factors_%d,\n", mode->mdct.kfft[k]->nfft);
> +      fprintf(file, "(ne10_fft_cpx_float32_t *)ne10_twiddles_%d,\n", mode->mdct.kfft[k]->nfft);
> +      fprintf(file, "NULL,\n");  /* buffer */
> +      fprintf(file, "(ne10_fft_cpx_float32_t *)&ne10_twiddles_%d[%d],\n",
> +                     mode->mdct.kfft[k]->nfft, cfg->nfft);
> +      fprintf(file, "};\n");
> +   }
> +   fprintf(file, "#endif  /*end NE10_FFT_PARAMS%d_%d*/\n", mode->Fs, mdctSize);
> +}
> diff --git a/celt/dump_modes/dump_modes.c b/celt/dump_modes/dump_modes.c
> index ae6a8c1..80947ec 100644
> --- a/celt/dump_modes/dump_modes.c
> +++ b/celt/dump_modes/dump_modes.c
> @@ -35,6 +35,7 @@
>  #include "modes.h"
>  #include "celt.h"
>  #include "rate.h"
> +#include "dump_modes_arch.h"
>
>  #define INT16 "%d"
>  #define INT32 "%d"
> @@ -62,6 +63,10 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
>     fprintf(file, "\n   It contains static definitions for some pre-defined modes. */\n");
>     fprintf(file, "#include \"modes.h\"\n");
>     fprintf(file, "#include \"rate.h\"\n");
> +   fprintf(file, "\n#ifdef HAVE_ARM_NE10\n");
> +   fprintf(file, "#define OVERRIDE_FFT 1\n");
> +   fprintf(file, "#include \"%s\"\n", ARM_NE10_ARCH_FILE_NAME);
> +   fprintf(file, "#endif\n");
>
>     fprintf(file, "\n");
>
> @@ -149,6 +154,9 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
>           fprintf (file, "{" WORD16 ", " WORD16 "},%c", mode->mdct.kfft[0]->twiddles[j].r, mode->mdct.kfft[0]->twiddles[j].i,(j+3)%2==0?'\n':' ');
>        fprintf (file, "};\n");
>
> +#ifdef OVERRIDE_FFT
> +      dump_mode_arch(mode);
> +#endif
>        /* FFT Bitrev tables */
>        for (k=0;k<=mode->mdct.maxshift;k++)
>        {
> @@ -183,6 +191,13 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
>           fprintf (file, "},    /* factors */\n");
>           fprintf (file, "fft_bitrev%d,    /* bitrev */\n", mode->mdct.kfft[k]->nfft);
>           fprintf (file, "fft_twiddles%d_%d,    /* bitrev */\n", mode->Fs, mdctSize);
> +
> +         fprintf (file, "#ifdef OVERRIDE_FFT\n");
> +         fprintf (file, "(void *)&cfg_arch_%d,\n", mode->mdct.kfft[k]->nfft);
> +         fprintf (file, "#else\n");
> +         fprintf (file, "NULL,\n");
> +         fprintf(file, "#endif\n");
> +
>           fprintf (file, "};\n");
>
>           fprintf(file, "#endif\n");
> @@ -205,7 +220,6 @@ void dump_modes(FILE *file, CELTMode **modes, int nb_modes)
>        fprintf(file, "#endif\n");
>        fprintf(file, "\n");
>
> -
>        /* Print the actual mode data */
>        fprintf(file, "static const CELTMode mode%d_%d_%d = {\n", mode->Fs, mdctSize, mode->overlap);
>        fprintf(file, INT32 ",    /* Fs */\n", mode->Fs);
> @@ -323,8 +337,14 @@ int main(int argc, char **argv)
>        }
>     }
>     file = fopen(BASENAME ".h", "w");
> +#ifdef OVERRIDE_FFT
> +   dump_modes_arch_init(m, nb);
> +#endif
>     dump_modes(file, m, nb);
>     fclose(file);
> +#ifdef OVERRIDE_FFT
> +   dump_modes_arch_finalize();
> +#endif
>     for (i=0;i<nb;i++)
>        opus_custom_mode_destroy(m[i]);
>     free(m);
> diff --git a/celt/dump_modes/dump_modes_arch.h b/celt/dump_modes/dump_modes_arch.h
> new file mode 100644
> index 0000000..f7df55b
> --- /dev/null
> +++ b/celt/dump_modes/dump_modes_arch.h
> @@ -0,0 +1,14 @@
> +#ifndef DUMP_MODE_ARCH_H
> +#define DUMP_MODE_ARCH_H
> +/* Hooks dump_modes.c calls when OVERRIDE_FFT is defined; full prototypes so calls are type-checked. */
> +void dump_modes_arch_init(CELTMode **modes, int nb_modes);
> +void dump_mode_arch(CELTMode *mode);
> +void dump_modes_arch_finalize(void);
> +
> +#define ARM_NE10_ARCH_FILE_NAME "static_modes_float_arm_ne10.h"
> +
> +#if defined(HAVE_ARM_NE10)
> +#define OVERRIDE_FFT (1)
> +#endif
> +
> +#endif
> diff --git a/celt/kiss_fft.c b/celt/kiss_fft.c
> index cc487fc..9a76206 100644
> --- a/celt/kiss_fft.c
> +++ b/celt/kiss_fft.c
> @@ -423,6 +423,11 @@ static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
>  #endif
>  }
>
> +int opus_fft_alloc_arch_c(kiss_fft_state *st) {  /* C fallback: no arch-specific state to set up */
> +   (void)st;
> +   return 0;  /* opus_fft_alloc_twiddles() treats a non-zero result as failure */
> +}
> +
>  /*
>   *
>   * Allocates all necessary storage space for the fft and ifft.
> @@ -478,6 +483,10 @@ kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,  co
>          if (st->bitrev==NULL)
>              goto fail;
>          compute_bitrev_table(0, bitrev, 1,1, st->factors,st);
> +
> +        /* Initialize architecture specific fft parameters */
> +        if (opus_fft_alloc_arch(st, opus_select_arch()))
> +            goto fail;
>      }
>      return st;
>  fail:
> @@ -485,15 +494,20 @@ fail:
>      return NULL;
>  }
>
> -kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem )
> +kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem)
>  {
>     return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL);
>  }
>
> +void opus_fft_free_arch_c(kiss_fft_state *st) {  /* C fallback: nothing arch-specific to release */
> +   (void)st;
> +}
> +
>  void opus_fft_free(const kiss_fft_state *cfg)
>  {
>     if (cfg)
>     {
> +      opus_fft_free_arch((kiss_fft_state *)cfg, opus_select_arch());
>        opus_free((opus_int16*)cfg->bitrev);
>        if (cfg->shift < 0)
>           opus_free((kiss_twiddle_cpx*)cfg->twiddles);
> @@ -551,7 +565,7 @@ void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
>      }
>  }
>
> -void opus_fft(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
> +void opus_fft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
>  {
>     int i;
>     opus_val16 scale;
> diff --git a/celt/kiss_fft.h b/celt/kiss_fft.h
> index 390b54d..f9232f9 100644
> --- a/celt/kiss_fft.h
> +++ b/celt/kiss_fft.h
> @@ -32,6 +32,7 @@
>  #include <stdlib.h>
>  #include <math.h>
>  #include "arch.h"
> +#include "cpu_support.h"
>
>  #ifdef __cplusplus
>  extern "C" {
> @@ -59,6 +60,7 @@ extern "C" {
>  #   define kiss_twiddle_scalar float
>  #   define KF_SUFFIX _celt_single
>  # endif
> +
>  #endif
>
>  typedef struct {
> @@ -87,8 +89,13 @@ typedef struct kiss_fft_state{
>      opus_int16 factors[2*MAXFACTORS];
>      const opus_int16 *bitrev;
>      const kiss_twiddle_cpx *twiddles;
> +    void *priv; /* Used by arch-specific optimizations */
>  } kiss_fft_state;
>
> +#if !defined(FIXED_POINT) && defined(HAVE_ARM_NE10)
> +#include "arm/fft_arm.h"
> +#endif
> +
>  /*typedef struct kiss_fft_state* kiss_fft_cfg;*/
>
>  /**
> @@ -128,7 +135,7 @@ kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem);
>   * Note that each element is complex and can be accessed like
>      f[k].r and f[k].i
>   * */
> -void opus_fft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
> +void opus_fft_c(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
>  void opus_ifft(const kiss_fft_state *cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout);
>
>  void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
> @@ -136,6 +143,41 @@ void opus_ifft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout);
>
>  void opus_fft_free(const kiss_fft_state *cfg);
>
> +
> +void opus_fft_free_arch_c(kiss_fft_state *st);
> +int opus_fft_alloc_arch_c(kiss_fft_state *st);
> +
> +#if !defined(OVERRIDE_OPUS_FFT)
> +/* Is run-time CPU detection enabled on this platform? */
> +#if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10))
> +/* Dispatch tables indexed by arch; extern here so the header only declares them (define once in a .c file). */
> +extern int (*const OPUS_FFT_ALLOC_ARCH[OPUS_ARCHMASK+1])(kiss_fft_state *st);
> +
> +#define opus_fft_alloc_arch(_st, arch) \
> +         ((*OPUS_FFT_ALLOC_ARCH[(arch)&OPUS_ARCHMASK])(_st))
> +
> +extern void (*const OPUS_FFT_FREE_ARCH[OPUS_ARCHMASK+1])(kiss_fft_state *st);
> +#define opus_fft_free_arch(_st, arch) \
> +         ((*OPUS_FFT_FREE_ARCH[(arch)&OPUS_ARCHMASK])(_st))
> +
> +extern void (*const OPUS_FFT[OPUS_ARCHMASK+1])(const kiss_fft_state *cfg,
> +                                              const kiss_fft_cpx *fin,
> +                                              kiss_fft_cpx *fout);
> +#define opus_fft(_cfg, _fin, _fout, arch) \
> +   ((*OPUS_FFT[(arch)&OPUS_ARCHMASK])(_cfg, _fin, _fout))
> +#else /* else for if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
> +/* No run-time dispatch: map straight to the C implementations; the arch argument is ignored. */
> +#define opus_fft_alloc_arch(_st, arch) \
> +         opus_fft_alloc_arch_c(_st)
> +
> +#define opus_fft_free_arch(_st, arch) \
> +         opus_fft_free_arch_c(_st)
> +
> +#define opus_fft(_cfg, _fin, _fout, arch) \
> +         opus_fft_c(_cfg, _fin, _fout)
> +#endif /* end if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
> +#endif /* end if !defined(OVERRIDE_OPUS_FFT) */
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/celt/mdct.c b/celt/mdct.c
> index 2795d90..7e55157 100644
> --- a/celt/mdct.c
> +++ b/celt/mdct.c
> @@ -116,7 +116,7 @@ void clt_mdct_clear(mdct_lookup *l)
>
>  /* Forward MDCT trashes the input array */
>  #ifndef OVERRIDE_clt_mdct_forward
> -void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
> +void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
>        const opus_val16 *window, int overlap, int shift, int stride)
>  {
>     int i;
> diff --git a/celt/mdct.h b/celt/mdct.h
> index d721821..52d7cca 100644
> --- a/celt/mdct.h
> +++ b/celt/mdct.h
> @@ -53,11 +53,16 @@ typedef struct {
>     const kiss_twiddle_scalar * OPUS_RESTRICT trig;
>  } mdct_lookup;
>
> +#if !defined(FIXED_POINT) && defined(HAVE_ARM_NE10)
> +#include "arm/mdct_arm.h"
> +#endif
> +
> +
>  int clt_mdct_init(mdct_lookup *l,int N, int maxshift);
>  void clt_mdct_clear(mdct_lookup *l);
>
>  /** Compute a forward MDCT and scale by 4/N, trashes the input array */
> -void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in,
> +void clt_mdct_forward_c(const mdct_lookup *l, kiss_fft_scalar *in,
>        kiss_fft_scalar * OPUS_RESTRICT out,
>        const opus_val16 *window, int overlap, int shift, int stride);
>
> @@ -67,4 +72,26 @@ void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in,
>        kiss_fft_scalar * OPUS_RESTRICT out,
>        const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride);
>
> +#if !defined(OVERRIDE_OPUS_MDCT)
> +/* Is run-time CPU detection enabled on this platform? */
> +#if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10))
> +/* Dispatch table indexed by arch; extern here so the header only declares it (define once in a .c file). */
> +extern void (*const CLT_MDCT_FORWARD_ARCH[OPUS_ARCHMASK+1])(const mdct_lookup *l,
> +                                      kiss_fft_scalar *in,
> +                                      kiss_fft_scalar * OPUS_RESTRICT out,
> +                                      const opus_val16 *window,
> +                                      int overlap, int shift, int stride);
> +
> +
> +#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, arch) \
> +   ((*CLT_MDCT_FORWARD_ARCH[(arch)&OPUS_ARCHMASK])(_l, _in, _out, \
> +                                      _window, _overlap, _shift, _stride))
> +#else /* else for if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
> +/* No run-time dispatch: call the C implementation directly; the arch argument is ignored. */
> +#define clt_mdct_forward(_l, _in, _out, _window, _overlap, _shift, _stride, arch) \
> +   clt_mdct_forward_c(_l, _in, _out, _window, _overlap, _shift, _stride)
> +
> +#endif /* end if defined(OPUS_HAVE_RTCD) && (defined(HAVE_ARM_NE10)) */
> +#endif /* end if !defined(OVERRIDE_OPUS_MDCT) */
> +
>  #endif
> diff --git a/celt/static_modes_float.h b/celt/static_modes_float.h
> index 2fadb62..e115b79 100644
> --- a/celt/static_modes_float.h
> +++ b/celt/static_modes_float.h
> @@ -4,6 +4,11 @@
>  #include "modes.h"
>  #include "rate.h"
>
> +#ifdef HAVE_ARM_NE10
> +#define OVERRIDE_FFT 1
> +#include "static_modes_float_arm_ne10.h"
> +#endif
> +
>  #ifndef DEF_WINDOW120
>  #define DEF_WINDOW120
>  static const opus_val16 window120[120] = {
> @@ -431,6 +436,11 @@ static const kiss_fft_state fft_state48000_960_0 = {
>  {5, 96, 3, 32, 4, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, },   /* factors */
>  fft_bitrev480,  /* bitrev */
>  fft_twiddles48000_960,  /* bitrev */
> +#ifdef OVERRIDE_FFT
> +(void *)&cfg_arch_480,
> +#else
> +NULL,
> +#endif
>  };
>  #endif
>
> @@ -443,6 +453,11 @@ static const kiss_fft_state fft_state48000_960_1 = {
>  {5, 48, 3, 16, 4, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
>  fft_bitrev240,  /* bitrev */
>  fft_twiddles48000_960,  /* bitrev */
> +#ifdef OVERRIDE_FFT
> +(void *)&cfg_arch_240,
> +#else
> +NULL,
> +#endif
>  };
>  #endif
>
> @@ -455,6 +470,11 @@ static const kiss_fft_state fft_state48000_960_2 = {
>  {5, 24, 3, 8, 2, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
>  fft_bitrev120,  /* bitrev */
>  fft_twiddles48000_960,  /* bitrev */
> +#ifdef OVERRIDE_FFT
> +(void *)&cfg_arch_120,
> +#else
> +NULL,
> +#endif
>  };
>  #endif
>
> @@ -467,6 +487,11 @@ static const kiss_fft_state fft_state48000_960_3 = {
>  {5, 12, 3, 4, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },    /* factors */
>  fft_bitrev60,   /* bitrev */
>  fft_twiddles48000_960,  /* bitrev */
> +#ifdef OVERRIDE_FFT
> +(void *)&cfg_arch_60,
> +#else
> +NULL,
> +#endif
>  };
>  #endif
>
> diff --git a/celt/static_modes_float_arm_ne10.h b/celt/static_modes_float_arm_ne10.h
> new file mode 100644
> index 0000000..4229048
> --- /dev/null
> +++ b/celt/static_modes_float_arm_ne10.h
> @@ -0,0 +1,367 @@
> +/* The contents of this file was automatically generated by
> + * dump_mode_arm_ne10.c with arguments: 48000 960
> + * It contains static definitions for some pre-defined modes. */
> +#include <NE10_init.h>
> +#ifndef NE10_FFT_PARAMS48000_960
> +#define NE10_FFT_PARAMS48000_960
> +static const ne10_int32_t ne10_factors_480[64] = {
> +4, 40, 4, 30, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, };
> +static const ne10_int32_t ne10_factors_240[64] = {
> +3, 20, 4, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, };
> +static const ne10_int32_t ne10_factors_120[64] = {
> +3, 10, 2, 15, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, };
> +static const ne10_int32_t ne10_factors_60[64] = {
> +2, 5, 5, 3, 3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
> +0, 0, 0, 0, };
> +static const ne10_fft_cpx_float32_t ne10_twiddles_480[480] = {
> +{1.000000,0.000000}, {1.000000,-0.000000}, {1.000000,-0.000000},
> +{1.000000,-0.000000}, {0.913545,-0.406737}, {0.669131,-0.743145},
> +{1.000000,-0.000000}, {0.669131,-0.743145}, {-0.104529,-0.994522},
> +{1.000000,-0.000000}, {0.309017,-0.951057}, {-0.809017,-0.587785},
> +{1.000000,-0.000000}, {-0.104529,-0.994522}, {-0.978148,0.207912},
> +{1.000000,-0.000000}, {0.978148,-0.207912}, {0.913545,-0.406737},
> +{0.809017,-0.587785}, {0.669131,-0.743145}, {0.500000,-0.866025},
> +{0.309017,-0.951057}, {0.104528,-0.994522}, {-0.104529,-0.994522},
> +{-0.309017,-0.951056}, {-0.500000,-0.866025}, {-0.669131,-0.743145},
> +{-0.809017,-0.587785}, {-0.913545,-0.406737}, {-0.978148,-0.207912},
> +{1.000000,-0.000000}, {0.998630,-0.052336}, {0.994522,-0.104528},
> +{0.987688,-0.156434}, {0.978148,-0.207912}, {0.965926,-0.258819},
> +{0.951057,-0.309017}, {0.933580,-0.358368}, {0.913545,-0.406737},
> +{0.891007,-0.453991}, {0.866025,-0.500000}, {0.838671,-0.544639},
> +{0.809017,-0.587785}, {0.777146,-0.629320}, {0.743145,-0.669131},
> +{0.707107,-0.707107}, {0.669131,-0.743145}, {0.629320,-0.777146},
> +{0.587785,-0.809017}, {0.544639,-0.838671}, {0.500000,-0.866025},
> +{0.453991,-0.891007}, {0.406737,-0.913545}, {0.358368,-0.933580},
> +{0.309017,-0.951057}, {0.258819,-0.965926}, {0.207912,-0.978148},
> +{0.156434,-0.987688}, {0.104528,-0.994522}, {0.052336,-0.998630},
> +{1.000000,-0.000000}, {0.994522,-0.104528}, {0.978148,-0.207912},
> +{0.951057,-0.309017}, {0.913545,-0.406737}, {0.866025,-0.500000},
> +{0.809017,-0.587785}, {0.743145,-0.669131}, {0.669131,-0.743145},
> +{0.587785,-0.809017}, {0.500000,-0.866025}, {0.406737,-0.913545},
> +{0.309017,-0.951057}, {0.207912,-0.978148}, {0.104528,-0.994522},
> +{-0.000000,-1.000000}, {-0.104529,-0.994522}, {-0.207912,-0.978148},
> +{-0.309017,-0.951056}, {-0.406737,-0.913545}, {-0.500000,-0.866025},
> +{-0.587785,-0.809017}, {-0.669131,-0.743145}, {-0.743145,-0.669130},
> +{-0.809017,-0.587785}, {-0.866025,-0.500000}, {-0.913545,-0.406737},
> +{-0.951057,-0.309017}, {-0.978148,-0.207912}, {-0.994522,-0.104528},
> +{1.000000,-0.000000}, {0.987688,-0.156434}, {0.951057,-0.309017},
> +{0.891007,-0.453991}, {0.809017,-0.587785}, {0.707107,-0.707107},
> +{0.587785,-0.809017}, {0.453991,-0.891007}, {0.309017,-0.951057},
> +{0.156434,-0.987688}, {-0.000000,-1.000000}, {-0.156434,-0.987688},
> +{-0.309017,-0.951056}, {-0.453991,-0.891006}, {-0.587785,-0.809017},
> +{-0.707107,-0.707107}, {-0.809017,-0.587785}, {-0.891007,-0.453990},
> +{-0.951057,-0.309017}, {-0.987688,-0.156434}, {-1.000000,0.000000},
> +{-0.987688,0.156435}, {-0.951057,0.309017}, {-0.891007,0.453991},
> +{-0.809017,0.587785}, {-0.707107,0.707107}, {-0.587785,0.809017},
> +{-0.453990,0.891007}, {-0.309017,0.951056}, {-0.156435,0.987688},
> +{1.000000,-0.000000}, {0.999914,-0.013090}, {0.999657,-0.026177},
> +{0.999229,-0.039260}, {0.998630,-0.052336}, {0.997859,-0.065403},
> +{0.996917,-0.078459}, {0.995805,-0.091502}, {0.994522,-0.104528},
> +{0.993068,-0.117537}, {0.991445,-0.130526}, {0.989651,-0.143493},
> +{0.987688,-0.156434}, {0.985556,-0.169350}, {0.983255,-0.182236},
> +{0.980785,-0.195090}, {0.978148,-0.207912}, {0.975342,-0.220697},
> +{0.972370,-0.233445}, {0.969231,-0.246153}, {0.965926,-0.258819},
> +{0.962455,-0.271440}, {0.958820,-0.284015}, {0.955020,-0.296542},
> +{0.951057,-0.309017}, {0.946930,-0.321439}, {0.942641,-0.333807},
> +{0.938191,-0.346117}, {0.933580,-0.358368}, {0.928810,-0.370557},
> +{0.923880,-0.382683}, {0.918791,-0.394744}, {0.913545,-0.406737},
> +{0.908143,-0.418660}, {0.902585,-0.430511}, {0.896873,-0.442289},
> +{0.891007,-0.453991}, {0.884988,-0.465615}, {0.878817,-0.477159},
> +{0.872496,-0.488621}, {0.866025,-0.500000}, {0.859406,-0.511293},
> +{0.852640,-0.522499}, {0.845728,-0.533615}, {0.838671,-0.544639},
> +{0.831470,-0.555570}, {0.824126,-0.566406}, {0.816642,-0.577145},
> +{0.809017,-0.587785}, {0.801254,-0.598325}, {0.793353,-0.608761},
> +{0.785317,-0.619094}, {0.777146,-0.629320}, {0.768842,-0.639439},
> +{0.760406,-0.649448}, {0.751840,-0.659346}, {0.743145,-0.669131},
> +{0.734322,-0.678801}, {0.725374,-0.688355}, {0.716302,-0.697791},
> +{0.707107,-0.707107}, {0.697790,-0.716302}, {0.688355,-0.725374},
> +{0.678801,-0.734323}, {0.669131,-0.743145}, {0.659346,-0.751840},
> +{0.649448,-0.760406}, {0.639439,-0.768842}, {0.629320,-0.777146},
> +{0.619094,-0.785317}, {0.608761,-0.793353}, {0.598325,-0.801254},
> +{0.587785,-0.809017}, {0.577145,-0.816642}, {0.566406,-0.824126},
> +{0.555570,-0.831470}, {0.544639,-0.838671}, {0.533615,-0.845728},
> +{0.522498,-0.852640}, {0.511293,-0.859406}, {0.500000,-0.866025},
> +{0.488621,-0.872496}, {0.477159,-0.878817}, {0.465614,-0.884988},
> +{0.453991,-0.891007}, {0.442289,-0.896873}, {0.430511,-0.902585},
> +{0.418660,-0.908143}, {0.406737,-0.913545}, {0.394744,-0.918791},
> +{0.382683,-0.923880}, {0.370557,-0.928810}, {0.358368,-0.933580},
> +{0.346117,-0.938191}, {0.333807,-0.942641}, {0.321439,-0.946930},
> +{0.309017,-0.951057}, {0.296542,-0.955020}, {0.284015,-0.958820},
> +{0.271440,-0.962455}, {0.258819,-0.965926}, {0.246153,-0.969231},
> +{0.233445,-0.972370}, {0.220697,-0.975342}, {0.207912,-0.978148},
> +{0.195090,-0.980785}, {0.182236,-0.983255}, {0.169349,-0.985556},
> +{0.156434,-0.987688}, {0.143493,-0.989651}, {0.130526,-0.991445},
> +{0.117537,-0.993068}, {0.104528,-0.994522}, {0.091502,-0.995805},
> +{0.078459,-0.996917}, {0.065403,-0.997859}, {0.052336,-0.998630},
> +{0.039260,-0.999229}, {0.026177,-0.999657}, {0.013090,-0.999914},
> +{1.000000,-0.000000}, {0.999657,-0.026177}, {0.998630,-0.052336},
> +{0.996917,-0.078459}, {0.994522,-0.104528}, {0.991445,-0.130526},
> +{0.987688,-0.156434}, {0.983255,-0.182236}, {0.978148,-0.207912},
> +{0.972370,-0.233445}, {0.965926,-0.258819}, {0.958820,-0.284015},
> +{0.951057,-0.309017}, {0.942641,-0.333807}, {0.933580,-0.358368},
> +{0.923880,-0.382683}, {0.913545,-0.406737}, {0.902585,-0.430511},
> +{0.891007,-0.453991}, {0.878817,-0.477159}, {0.866025,-0.500000},
> +{0.852640,-0.522499}, {0.838671,-0.544639}, {0.824126,-0.566406},
> +{0.809017,-0.587785}, {0.793353,-0.608761}, {0.777146,-0.629320},
> +{0.760406,-0.649448}, {0.743145,-0.669131}, {0.725374,-0.688355},
> +{0.707107,-0.707107}, {0.688355,-0.725374}, {0.669131,-0.743145},
> +{0.649448,-0.760406}, {0.629320,-0.777146}, {0.608761,-0.793353},
> +{0.587785,-0.809017}, {0.566406,-0.824126}, {0.544639,-0.838671},
> +{0.522498,-0.852640}, {0.500000,-0.866025}, {0.477159,-0.878817},
> +{0.453991,-0.891007}, {0.430511,-0.902585}, {0.406737,-0.913545},
> +{0.382683,-0.923880}, {0.358368,-0.933580}, {0.333807,-0.942641},
> +{0.309017,-0.951057}, {0.284015,-0.958820}, {0.258819,-0.965926},
> +{0.233445,-0.972370}, {0.207912,-0.978148}, {0.182236,-0.983255},
> +{0.156434,-0.987688}, {0.130526,-0.991445}, {0.104528,-0.994522},
> +{0.078459,-0.996917}, {0.052336,-0.998630}, {0.026177,-0.999657},
> +{-0.000000,-1.000000}, {-0.026177,-0.999657}, {-0.052336,-0.998630},
> +{-0.078459,-0.996917}, {-0.104529,-0.994522}, {-0.130526,-0.991445},
> +{-0.156434,-0.987688}, {-0.182236,-0.983255}, {-0.207912,-0.978148},
> +{-0.233445,-0.972370}, {-0.258819,-0.965926}, {-0.284015,-0.958820},
> +{-0.309017,-0.951056}, {-0.333807,-0.942641}, {-0.358368,-0.933580},
> +{-0.382684,-0.923880}, {-0.406737,-0.913545}, {-0.430511,-0.902585},
> +{-0.453991,-0.891006}, {-0.477159,-0.878817}, {-0.500000,-0.866025},
> +{-0.522499,-0.852640}, {-0.544639,-0.838671}, {-0.566406,-0.824126},
> +{-0.587785,-0.809017}, {-0.608761,-0.793353}, {-0.629321,-0.777146},
> +{-0.649448,-0.760406}, {-0.669131,-0.743145}, {-0.688355,-0.725374},
> +{-0.707107,-0.707107}, {-0.725374,-0.688354}, {-0.743145,-0.669130},
> +{-0.760406,-0.649448}, {-0.777146,-0.629320}, {-0.793353,-0.608761},
> +{-0.809017,-0.587785}, {-0.824126,-0.566406}, {-0.838671,-0.544639},
> +{-0.852640,-0.522498}, {-0.866025,-0.500000}, {-0.878817,-0.477159},
> +{-0.891007,-0.453990}, {-0.902585,-0.430511}, {-0.913545,-0.406737},
> +{-0.923880,-0.382683}, {-0.933580,-0.358368}, {-0.942642,-0.333807},
> +{-0.951057,-0.309017}, {-0.958820,-0.284015}, {-0.965926,-0.258819},
> +{-0.972370,-0.233445}, {-0.978148,-0.207912}, {-0.983255,-0.182235},
> +{-0.987688,-0.156434}, {-0.991445,-0.130526}, {-0.994522,-0.104528},
> +{-0.996917,-0.078459}, {-0.998630,-0.052336}, {-0.999657,-0.026177},
> +{1.000000,-0.000000}, {0.999229,-0.039260}, {0.996917,-0.078459},
> +{0.993068,-0.117537}, {0.987688,-0.156434}, {0.980785,-0.195090},
> +{0.972370,-0.233445}, {0.962455,-0.271440}, {0.951057,-0.309017},
> +{0.938191,-0.346117}, {0.923880,-0.382683}, {0.908143,-0.418660},
> +{0.891007,-0.453991}, {0.872496,-0.488621}, {0.852640,-0.522499},
> +{0.831470,-0.555570}, {0.809017,-0.587785}, {0.785317,-0.619094},
> +{0.760406,-0.649448}, {0.734322,-0.678801}, {0.707107,-0.707107},
> +{0.678801,-0.734323}, {0.649448,-0.760406}, {0.619094,-0.785317},
> +{0.587785,-0.809017}, {0.555570,-0.831470}, {0.522498,-0.852640},
> +{0.488621,-0.872496}, {0.453991,-0.891007}, {0.418660,-0.908143},
> +{0.382683,-0.923880}, {0.346117,-0.938191}, {0.309017,-0.951057},
> +{0.271440,-0.962455}, {0.233445,-0.972370}, {0.195090,-0.980785},
> +{0.156434,-0.987688}, {0.117537,-0.993068}, {0.078459,-0.996917},
> +{0.039260,-0.999229}, {-0.000000,-1.000000}, {-0.039260,-0.999229},
> +{-0.078459,-0.996917}, {-0.117537,-0.993068}, {-0.156434,-0.987688},
> +{-0.195090,-0.980785}, {-0.233445,-0.972370}, {-0.271440,-0.962455},
> +{-0.309017,-0.951056}, {-0.346117,-0.938191}, {-0.382684,-0.923880},
> +{-0.418660,-0.908143}, {-0.453991,-0.891006}, {-0.488621,-0.872496},
> +{-0.522499,-0.852640}, {-0.555570,-0.831470}, {-0.587785,-0.809017},
> +{-0.619094,-0.785317}, {-0.649448,-0.760406}, {-0.678801,-0.734322},
> +{-0.707107,-0.707107}, {-0.734323,-0.678801}, {-0.760406,-0.649448},
> +{-0.785317,-0.619094}, {-0.809017,-0.587785}, {-0.831470,-0.555570},
> +{-0.852640,-0.522498}, {-0.872496,-0.488621}, {-0.891007,-0.453990},
> +{-0.908143,-0.418660}, {-0.923880,-0.382683}, {-0.938191,-0.346117},
> +{-0.951057,-0.309017}, {-0.962455,-0.271440}, {-0.972370,-0.233445},
> +{-0.980785,-0.195090}, {-0.987688,-0.156434}, {-0.993068,-0.117537},
> +{-0.996917,-0.078459}, {-0.999229,-0.039260}, {-1.000000,0.000000},
> +{-0.999229,0.039260}, {-0.996917,0.078459}, {-0.993068,0.117538},
> +{-0.987688,0.156435}, {-0.980785,0.195090}, {-0.972370,0.233446},
> +{-0.962455,0.271441}, {-0.951057,0.309017}, {-0.938191,0.346117},
> +{-0.923880,0.382683}, {-0.908143,0.418660}, {-0.891007,0.453991},
> +{-0.872496,0.488621}, {-0.852640,0.522499}, {-0.831470,0.555570},
> +{-0.809017,0.587785}, {-0.785317,0.619094}, {-0.760406,0.649448},
> +{-0.734322,0.678801}, {-0.707107,0.707107}, {-0.678801,0.734323},
> +{-0.649448,0.760406}, {-0.619094,0.785317}, {-0.587785,0.809017},
> +{-0.555570,0.831470}, {-0.522498,0.852640}, {-0.488621,0.872496},
> +{-0.453990,0.891007}, {-0.418659,0.908143}, {-0.382683,0.923880},
> +{-0.346117,0.938191}, {-0.309017,0.951056}, {-0.271441,0.962455},
> +{-0.233445,0.972370}, {-0.195090,0.980785}, {-0.156435,0.987688},
> +{-0.117537,0.993068}, {-0.078459,0.996917}, {-0.039260,0.999229},
> +};
> +static const ne10_fft_cpx_float32_t ne10_twiddles_240[240] = {
> +{1.000000,0.000000}, {1.000000,-0.000000}, {1.000000,-0.000000},
> +{1.000000,-0.000000}, {0.913545,-0.406737}, {0.669131,-0.743145},
> +{1.000000,-0.000000}, {0.669131,-0.743145}, {-0.104529,-0.994522},
> +{1.000000,-0.000000}, {0.309017,-0.951057}, {-0.809017,-0.587785},
> +{1.000000,-0.000000}, {-0.104529,-0.994522}, {-0.978148,0.207912},
> +{1.000000,-0.000000}, {0.994522,-0.104528}, {0.978148,-0.207912},
> +{0.951057,-0.309017}, {0.913545,-0.406737}, {0.866025,-0.500000},
> +{0.809017,-0.587785}, {0.743145,-0.669131}, {0.669131,-0.743145},
> +{0.587785,-0.809017}, {0.500000,-0.866025}, {0.406737,-0.913545},
> +{0.309017,-0.951057}, {0.207912,-0.978148}, {0.104528,-0.994522},
> +{1.000000,-0.000000}, {0.978148,-0.207912}, {0.913545,-0.406737},
> +{0.809017,-0.587785}, {0.669131,-0.743145}, {0.500000,-0.866025},
> +{0.309017,-0.951057}, {0.104528,-0.994522}, {-0.104529,-0.994522},
> +{-0.309017,-0.951056}, {-0.500000,-0.866025}, {-0.669131,-0.743145},
> +{-0.809017,-0.587785}, {-0.913545,-0.406737}, {-0.978148,-0.207912},
> +{1.000000,-0.000000}, {0.951057,-0.309017}, {0.809017,-0.587785},
> +{0.587785,-0.809017}, {0.309017,-0.951057}, {-0.000000,-1.000000},
> +{-0.309017,-0.951056}, {-0.587785,-0.809017}, {-0.809017,-0.587785},
> +{-0.951057,-0.309017}, {-1.000000,0.000000}, {-0.951057,0.309017},
> +{-0.809017,0.587785}, {-0.587785,0.809017}, {-0.309017,0.951056},
> +{1.000000,-0.000000}, {0.999657,-0.026177}, {0.998630,-0.052336},
> +{0.996917,-0.078459}, {0.994522,-0.104528}, {0.991445,-0.130526},
> +{0.987688,-0.156434}, {0.983255,-0.182236}, {0.978148,-0.207912},
> +{0.972370,-0.233445}, {0.965926,-0.258819}, {0.958820,-0.284015},
> +{0.951057,-0.309017}, {0.942641,-0.333807}, {0.933580,-0.358368},
> +{0.923880,-0.382683}, {0.913545,-0.406737}, {0.902585,-0.430511},
> +{0.891007,-0.453991}, {0.878817,-0.477159}, {0.866025,-0.500000},
> +{0.852640,-0.522499}, {0.838671,-0.544639}, {0.824126,-0.566406},
> +{0.809017,-0.587785}, {0.793353,-0.608761}, {0.777146,-0.629320},
> +{0.760406,-0.649448}, {0.743145,-0.669131}, {0.725374,-0.688355},
> +{0.707107,-0.707107}, {0.688355,-0.725374}, {0.669131,-0.743145},
> +{0.649448,-0.760406}, {0.629320,-0.777146}, {0.608761,-0.793353},
> +{0.587785,-0.809017}, {0.566406,-0.824126}, {0.544639,-0.838671},
> +{0.522498,-0.852640}, {0.500000,-0.866025}, {0.477159,-0.878817},
> +{0.453991,-0.891007}, {0.430511,-0.902585}, {0.406737,-0.913545},
> +{0.382683,-0.923880}, {0.358368,-0.933580}, {0.333807,-0.942641},
> +{0.309017,-0.951057}, {0.284015,-0.958820}, {0.258819,-0.965926},
> +{0.233445,-0.972370}, {0.207912,-0.978148}, {0.182236,-0.983255},
> +{0.156434,-0.987688}, {0.130526,-0.991445}, {0.104528,-0.994522},
> +{0.078459,-0.996917}, {0.052336,-0.998630}, {0.026177,-0.999657},
> +{1.000000,-0.000000}, {0.998630,-0.052336}, {0.994522,-0.104528},
> +{0.987688,-0.156434}, {0.978148,-0.207912}, {0.965926,-0.258819},
> +{0.951057,-0.309017}, {0.933580,-0.358368}, {0.913545,-0.406737},
> +{0.891007,-0.453991}, {0.866025,-0.500000}, {0.838671,-0.544639},
> +{0.809017,-0.587785}, {0.777146,-0.629320}, {0.743145,-0.669131},
> +{0.707107,-0.707107}, {0.669131,-0.743145}, {0.629320,-0.777146},
> +{0.587785,-0.809017}, {0.544639,-0.838671}, {0.500000,-0.866025},
> +{0.453991,-0.891007}, {0.406737,-0.913545}, {0.358368,-0.933580},
> +{0.309017,-0.951057}, {0.258819,-0.965926}, {0.207912,-0.978148},
> +{0.156434,-0.987688}, {0.104528,-0.994522}, {0.052336,-0.998630},
> +{-0.000000,-1.000000}, {-0.052336,-0.998630}, {-0.104529,-0.994522},
> +{-0.156434,-0.987688}, {-0.207912,-0.978148}, {-0.258819,-0.965926},
> +{-0.309017,-0.951056}, {-0.358368,-0.933580}, {-0.406737,-0.913545},
> +{-0.453991,-0.891006}, {-0.500000,-0.866025}, {-0.544639,-0.838671},
> +{-0.587785,-0.809017}, {-0.629321,-0.777146}, {-0.669131,-0.743145},
> +{-0.707107,-0.707107}, {-0.743145,-0.669130}, {-0.777146,-0.629320},
> +{-0.809017,-0.587785}, {-0.838671,-0.544639}, {-0.866025,-0.500000},
> +{-0.891007,-0.453990}, {-0.913545,-0.406737}, {-0.933580,-0.358368},
> +{-0.951057,-0.309017}, {-0.965926,-0.258819}, {-0.978148,-0.207912},
> +{-0.987688,-0.156434}, {-0.994522,-0.104528}, {-0.998630,-0.052336},
> +{1.000000,-0.000000}, {0.996917,-0.078459}, {0.987688,-0.156434},
> +{0.972370,-0.233445}, {0.951057,-0.309017}, {0.923880,-0.382683},
> +{0.891007,-0.453991}, {0.852640,-0.522499}, {0.809017,-0.587785},
> +{0.760406,-0.649448}, {0.707107,-0.707107}, {0.649448,-0.760406},
> +{0.587785,-0.809017}, {0.522498,-0.852640}, {0.453991,-0.891007},
> +{0.382683,-0.923880}, {0.309017,-0.951057}, {0.233445,-0.972370},
> +{0.156434,-0.987688}, {0.078459,-0.996917}, {-0.000000,-1.000000},
> +{-0.078459,-0.996917}, {-0.156434,-0.987688}, {-0.233445,-0.972370},
> +{-0.309017,-0.951056}, {-0.382684,-0.923880}, {-0.453991,-0.891006},
> +{-0.522499,-0.852640}, {-0.587785,-0.809017}, {-0.649448,-0.760406},
> +{-0.707107,-0.707107}, {-0.760406,-0.649448}, {-0.809017,-0.587785},
> +{-0.852640,-0.522498}, {-0.891007,-0.453990}, {-0.923880,-0.382683},
> +{-0.951057,-0.309017}, {-0.972370,-0.233445}, {-0.987688,-0.156434},
> +{-0.996917,-0.078459}, {-1.000000,0.000000}, {-0.996917,0.078459},
> +{-0.987688,0.156435}, {-0.972370,0.233446}, {-0.951057,0.309017},
> +{-0.923880,0.382683}, {-0.891007,0.453991}, {-0.852640,0.522499},
> +{-0.809017,0.587785}, {-0.760406,0.649448}, {-0.707107,0.707107},
> +{-0.649448,0.760406}, {-0.587785,0.809017}, {-0.522498,0.852640},
> +{-0.453990,0.891007}, {-0.382683,0.923880}, {-0.309017,0.951056},
> +{-0.233445,0.972370}, {-0.156435,0.987688}, {-0.078459,0.996917},
> +};
> +static const ne10_fft_cpx_float32_t ne10_twiddles_120[120] = {
> +{1.000000,0.000000}, {1.000000,-0.000000}, {1.000000,-0.000000},
> +{1.000000,-0.000000}, {0.913545,-0.406737}, {0.669131,-0.743145},
> +{1.000000,-0.000000}, {0.669131,-0.743145}, {-0.104529,-0.994522},
> +{1.000000,-0.000000}, {0.309017,-0.951057}, {-0.809017,-0.587785},
> +{1.000000,-0.000000}, {-0.104529,-0.994522}, {-0.978148,0.207912},
> +{1.000000,-0.000000}, {0.978148,-0.207912}, {0.913545,-0.406737},
> +{0.809017,-0.587785}, {0.669131,-0.743145}, {0.500000,-0.866025},
> +{0.309017,-0.951057}, {0.104528,-0.994522}, {-0.104529,-0.994522},
> +{-0.309017,-0.951056}, {-0.500000,-0.866025}, {-0.669131,-0.743145},
> +{-0.809017,-0.587785}, {-0.913545,-0.406737}, {-0.978148,-0.207912},
> +{1.000000,-0.000000}, {0.998630,-0.052336}, {0.994522,-0.104528},
> +{0.987688,-0.156434}, {0.978148,-0.207912}, {0.965926,-0.258819},
> +{0.951057,-0.309017}, {0.933580,-0.358368}, {0.913545,-0.406737},
> +{0.891007,-0.453991}, {0.866025,-0.500000}, {0.838671,-0.544639},
> +{0.809017,-0.587785}, {0.777146,-0.629320}, {0.743145,-0.669131},
> +{0.707107,-0.707107}, {0.669131,-0.743145}, {0.629320,-0.777146},
> +{0.587785,-0.809017}, {0.544639,-0.838671}, {0.500000,-0.866025},
> +{0.453991,-0.891007}, {0.406737,-0.913545}, {0.358368,-0.933580},
> +{0.309017,-0.951057}, {0.258819,-0.965926}, {0.207912,-0.978148},
> +{0.156434,-0.987688}, {0.104528,-0.994522}, {0.052336,-0.998630},
> +{1.000000,-0.000000}, {0.994522,-0.104528}, {0.978148,-0.207912},
> +{0.951057,-0.309017}, {0.913545,-0.406737}, {0.866025,-0.500000},
> +{0.809017,-0.587785}, {0.743145,-0.669131}, {0.669131,-0.743145},
> +{0.587785,-0.809017}, {0.500000,-0.866025}, {0.406737,-0.913545},
> +{0.309017,-0.951057}, {0.207912,-0.978148}, {0.104528,-0.994522},
> +{-0.000000,-1.000000}, {-0.104529,-0.994522}, {-0.207912,-0.978148},
> +{-0.309017,-0.951056}, {-0.406737,-0.913545}, {-0.500000,-0.866025},
> +{-0.587785,-0.809017}, {-0.669131,-0.743145}, {-0.743145,-0.669130},
> +{-0.809017,-0.587785}, {-0.866025,-0.500000}, {-0.913545,-0.406737},
> +{-0.951057,-0.309017}, {-0.978148,-0.207912}, {-0.994522,-0.104528},
> +{1.000000,-0.000000}, {0.987688,-0.156434}, {0.951057,-0.309017},
> +{0.891007,-0.453991}, {0.809017,-0.587785}, {0.707107,-0.707107},
> +{0.587785,-0.809017}, {0.453991,-0.891007}, {0.309017,-0.951057},
> +{0.156434,-0.987688}, {-0.000000,-1.000000}, {-0.156434,-0.987688},
> +{-0.309017,-0.951056}, {-0.453991,-0.891006}, {-0.587785,-0.809017},
> +{-0.707107,-0.707107}, {-0.809017,-0.587785}, {-0.891007,-0.453990},
> +{-0.951057,-0.309017}, {-0.987688,-0.156434}, {-1.000000,0.000000},
> +{-0.987688,0.156435}, {-0.951057,0.309017}, {-0.891007,0.453991},
> +{-0.809017,0.587785}, {-0.707107,0.707107}, {-0.587785,0.809017},
> +{-0.453990,0.891007}, {-0.309017,0.951056}, {-0.156435,0.987688},
> +};
> +static const ne10_fft_cpx_float32_t ne10_twiddles_60[60] = {
> +{1.000000,0.000000}, {1.000000,-0.000000}, {1.000000,-0.000000},
> +{1.000000,-0.000000}, {0.913545,-0.406737}, {0.669131,-0.743145},
> +{1.000000,-0.000000}, {0.669131,-0.743145}, {-0.104529,-0.994522},
> +{1.000000,-0.000000}, {0.309017,-0.951057}, {-0.809017,-0.587785},
> +{1.000000,-0.000000}, {-0.104529,-0.994522}, {-0.978148,0.207912},
> +{1.000000,-0.000000}, {0.994522,-0.104528}, {0.978148,-0.207912},
> +{0.951057,-0.309017}, {0.913545,-0.406737}, {0.866025,-0.500000},
> +{0.809017,-0.587785}, {0.743145,-0.669131}, {0.669131,-0.743145},
> +{0.587785,-0.809017}, {0.500000,-0.866025}, {0.406737,-0.913545},
> +{0.309017,-0.951057}, {0.207912,-0.978148}, {0.104528,-0.994522},
> +{1.000000,-0.000000}, {0.978148,-0.207912}, {0.913545,-0.406737},
> +{0.809017,-0.587785}, {0.669131,-0.743145}, {0.500000,-0.866025},
> +{0.309017,-0.951057}, {0.104528,-0.994522}, {-0.104529,-0.994522},
> +{-0.309017,-0.951056}, {-0.500000,-0.866025}, {-0.669131,-0.743145},
> +{-0.809017,-0.587785}, {-0.913545,-0.406737}, {-0.978148,-0.207912},
> +{1.000000,-0.000000}, {0.951057,-0.309017}, {0.809017,-0.587785},
> +{0.587785,-0.809017}, {0.309017,-0.951057}, {-0.000000,-1.000000},
> +{-0.309017,-0.951056}, {-0.587785,-0.809017}, {-0.809017,-0.587785},
> +{-0.951057,-0.309017}, {-1.000000,0.000000}, {-0.951057,0.309017},
> +{-0.809017,0.587785}, {-0.587785,0.809017}, {-0.309017,0.951056},
> +};
> +static const ne10_fft_state_float32_t cfg_arch_480 = {
> +120,
> +(ne10_int32_t *)ne10_factors_480,
> +(ne10_fft_cpx_float32_t *)ne10_twiddles_480,
> +NULL,
> +(ne10_fft_cpx_float32_t *)&ne10_twiddles_480[120],
> +};
> +static const ne10_fft_state_float32_t cfg_arch_240 = {
> +60,
> +(ne10_int32_t *)ne10_factors_240,
> +(ne10_fft_cpx_float32_t *)ne10_twiddles_240,
> +NULL,
> +(ne10_fft_cpx_float32_t *)&ne10_twiddles_240[60],
> +};
> +static const ne10_fft_state_float32_t cfg_arch_120 = {
> +30,
> +(ne10_int32_t *)ne10_factors_120,
> +(ne10_fft_cpx_float32_t *)ne10_twiddles_120,
> +NULL,
> +(ne10_fft_cpx_float32_t *)&ne10_twiddles_120[30],
> +};
> +static const ne10_fft_state_float32_t cfg_arch_60 = {
> +15,
> +(ne10_int32_t *)ne10_factors_60,
> +(ne10_fft_cpx_float32_t *)ne10_twiddles_60,
> +NULL,
> +(ne10_fft_cpx_float32_t *)&ne10_twiddles_60[15],
> +};
> +#endif  /*end NE10_FFT_PARAMS48000_960*/
> diff --git a/celt/tests/test_unit_dft.c b/celt/tests/test_unit_dft.c
> index 57db0e3..8996f17 100644
> --- a/celt/tests/test_unit_dft.c
> +++ b/celt/tests/test_unit_dft.c
> @@ -45,6 +45,16 @@
>  #include "mathops.c"
>  #include "entcode.c"
>
> +#if defined(OPUS_HAVE_RTCD) && \
> +         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
> +#include "arm/armcpu.c"
> +#if defined(HAVE_ARM_NE10)
> +#include "arm/celt_ne10_fft.c"
> +#include "arm/arm_celt_ne10_fft_map.c"
> +#endif
> +#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#include "x86/x86cpu.c"
> +#endif
>
>  #ifndef M_PI
>  #define M_PI 3.141592653
> @@ -125,7 +135,7 @@ void test1d(int nfft,int isinverse)
>      if (isinverse)
>         opus_ifft(cfg,in,out);
>      else
> -       opus_fft(cfg,in,out);
> +       opus_fft(cfg,in,out, opus_select_arch());
>
>      /*for (k=0;k<nfft;++k) printf("%d %d ", out[k].r, out[k].i);printf("\n");*/
>
> @@ -153,10 +163,12 @@ int main(int argc,char ** argv)
>          test1d(256,0);
>          test1d(256,1);
>  #ifndef RADIX_TWO_ONLY
> +#ifndef HAVE_ARM_NE10
>          test1d(36,0);
>          test1d(36,1);
>          test1d(50,0);
>          test1d(50,1);
> +#endif
>          test1d(120,0);
>          test1d(120,1);
>  #endif
> diff --git a/celt/tests/test_unit_mdct.c b/celt/tests/test_unit_mdct.c
> index ac8957f..950d824 100644
> --- a/celt/tests/test_unit_mdct.c
> +++ b/celt/tests/test_unit_mdct.c
> @@ -46,6 +46,19 @@
>  #include "mathops.c"
>  #include "entcode.c"
>
> +#if defined(OPUS_HAVE_RTCD) && \
> +         (defined(OPUS_ARM_ASM) || defined(OPUS_ARM_NEON_INTR))
> +#include "arm/armcpu.c"
> +#if defined(HAVE_ARM_NE10)
> +#include "arm/celt_ne10_fft.c"
> +#include "arm/celt_ne10_mdct.c"
> +#include "arm/arm_celt_ne10_fft_map.c"
> +#include "arm/arm_celt_ne10_mdct_map.c"
> +#endif
> +#elif defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1)
> +#include "x86/x86cpu.c"
> +#endif
> +
>  #ifndef M_PI
>  #define M_PI 3.141592653
>  #endif
> @@ -156,7 +169,7 @@ void test1d(int nfft,int isinverse)
>            out[nfft-k-1] = out[nfft/2+k];
>         check_inv(in,out,nfft,isinverse);
>      } else {
> -       clt_mdct_forward(&cfg,in,out,window, nfft/2, 0, 1);
> +       clt_mdct_forward(&cfg,in,out,window, nfft/2, 0, 1, opus_select_arch());
>         check(in_copy,out,nfft,isinverse);
>      }
>      /*for (k=0;k<nfft;++k) printf("%d %d ", out[k].r, out[k].i);printf("\n");*/
> @@ -188,6 +201,9 @@ int main(int argc,char ** argv)
>          test1d(2048,0);
>          test1d(2048,1);
>  #ifndef RADIX_TWO_ONLY
> +
> +/* The ARM NE10 library does not support the sizes below */
> +#ifndef HAVE_ARM_NE10
>          test1d(36,0);
>          test1d(36,1);
>          test1d(40,0);
> @@ -196,6 +212,7 @@ int main(int argc,char ** argv)
>          test1d(60,1);
>          test1d(120,0);
>          test1d(120,1);
> +#endif
>          test1d(240,0);
>          test1d(240,1);
>          test1d(480,0);
> diff --git a/celt_headers.mk b/celt_headers.mk
> index 5bb193e..c51c3ee 100644
> --- a/celt_headers.mk
> +++ b/celt_headers.mk
> @@ -31,11 +31,14 @@ celt/stack_alloc.h \
>  celt/vq.h \
>  celt/static_modes_float.h \
>  celt/static_modes_fixed.h \
> +celt/static_modes_float_arm_ne10.h \
>  celt/arm/armcpu.h \
>  celt/arm/fixed_armv4.h \
>  celt/arm/fixed_armv5e.h \
>  celt/arm/kiss_fft_armv4.h \
>  celt/arm/kiss_fft_armv5e.h \
>  celt/arm/pitch_arm.h \
> +celt/arm/fft_arm.h \
> +celt/arm/mdct_arm.h \
>  celt/x86/pitch_sse.h \
>  celt/x86/x86cpu.h
> diff --git a/celt_sources.mk b/celt_sources.mk
> index 29ec937..28c7bae 100644
> --- a/celt_sources.mk
> +++ b/celt_sources.mk
> @@ -35,3 +35,9 @@ celt/arm/armopts.s.in
>
>  CELT_SOURCES_ARM_NEON_INTR = \
>  celt/arm/celt_neon_intr.c
> +
> +CELT_SOURCES_ARM_NE10= \
> +celt/arm/celt_ne10_fft.c \
> +celt/arm/celt_ne10_mdct.c \
> +celt/arm/arm_celt_ne10_fft_map.c \
> +celt/arm/arm_celt_ne10_mdct_map.c
> diff --git a/configure.ac b/configure.ac
> index 87cece9..baa3425 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -351,6 +351,80 @@ AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM],
>  AM_CONDITIONAL([HAVE_SSE4_1], [false])
>  AM_CONDITIONAL([HAVE_SSE2], [false])
>
> +AC_DEFUN([OPUS_PATH_NE10],
> +   [
> +      AC_ARG_WITH(NE10,
> +                  AC_HELP_STRING([--with-NE10=PFX],[Prefix where libNE10 is installed (optional)]),
> +                  NE10_prefix="$withval", NE10_prefix="")
> +      AC_ARG_WITH(NE10-libraries,
> +                  AC_HELP_STRING([--with-NE10-libraries=DIR],
> +                        [Directory where libNE10 library is installed (optional)]),
> +                  NE10_libraries="$withval", NE10_libraries="")
> +      AC_ARG_WITH(NE10-includes,
> +                  AC_HELP_STRING([--with-NE10-includes=DIR],
> +                                 [Directory where libNE10 header files are installed (optional)]),
> +                  NE10_includes="$withval", NE10_includes="")
> +
> +      if test "x$NE10_libraries" != "x" ; then
> +         NE10_LIBS="-L$NE10_libraries"
> +      elif test "x$NE10_prefix" = "xno" || test "x$NE10_prefix" = "xyes" ; then
> +         NE10_LIBS=""
> +      elif test "x$NE10_prefix" != "x" ; then
> +         NE10_LIBS="-L$NE10_prefix/lib"
> +      elif test "x$prefix" != "xNONE" ; then
> +         NE10_LIBS="-L$prefix/lib"
> +      fi
> +
> +      if test "x$NE10_prefix" != "xno" ; then
> +         NE10_LIBS="$NE10_LIBS -lNE10"
> +      fi
> +
> +      if test "x$NE10_includes" != "x" ; then
> +         NE10_CFLAGS="-I$NE10_includes"
> +      elif test "x$NE10_prefix" = "xno" || test "x$NE10_prefix" = "xyes" ; then
> +         NE10_CFLAGS=""
> +      elif test "x$NE10_prefix" != "x" ; then
> +         NE10_CFLAGS="-I$NE10_prefix/include"
> +      elif test "x$prefix" != "xNONE"; then
> +         NE10_CFLAGS="-I$prefix/include"
> +      fi
> +
> +      AC_MSG_CHECKING(for NE10)
> +      save_CFLAGS="$CFLAGS"; CFLAGS="$CFLAGS $NE10_CFLAGS"
> +      save_LIBS="$LIBS"; LIBS="$LIBS $NE10_LIBS"
> +      AC_LINK_IFELSE(
> +         [
> +            AC_LANG_PROGRAM(
> +               [[#include <NE10_init.h>
> +               ]],
> +               [[
> +                  ne10_fft_cfg_float32_t cfg;
> +                  cfg = ne10_fft_alloc_c2c_float32_neon(480);
> +               ]]
> +            )
> +         ],[
> +            HAVE_ARM_NE10=1
> +            AC_MSG_RESULT([yes])
> +         ],[
> +            HAVE_ARM_NE10=0
> +            AC_MSG_RESULT([no])
> +            NE10_CFLAGS=""
> +            NE10_LIBS=""
> +         ]
> +      )
> +      CFLAGS="$save_CFLAGS"; LIBS="$save_LIBS"
> +      # HAVE_ARM_NE10 now records whether the libNE10 link test above succeeded
> +      AS_IF([test x"$HAVE_ARM_NE10" = x"1"],
> +         [
> +            AC_DEFINE([HAVE_ARM_NE10], 1, [NE10 library is installed on host. Make sure it is on target!])
> +            AC_SUBST(HAVE_ARM_NE10)
> +            AC_SUBST(NE10_CFLAGS)
> +            AC_SUBST(NE10_LIBS)
> +         ],[]
> +      )
> +   ]
> +)
> +
>  AS_IF([test x"$enable_intrinsics" = x"yes"],[
>     case $host_cpu in
>     arm*)
> @@ -391,6 +465,10 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
>              AC_DEFINE([OPUS_ARM_MAY_HAVE_EDSP], 1, [Define if compiler support EDSP Instructions])
>              AC_DEFINE([OPUS_ARM_MAY_HAVE_MEDIA], 1, [Define if compiler support MEDIA Instructions])
>              AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support NEON instructions])
> +
> +            OPUS_PATH_NE10()
> +            AS_IF([test x"$NE10_LIBS" != "x"],
> +                  [enable_intrinsics="$enable_intrinsics NE10"],[])
>           ],
>           [
>              AC_MSG_WARN([Compiler does not support ARM intrinsics])
> @@ -516,6 +594,9 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
>  AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
>  AM_CONDITIONAL([OPUS_ARM_NEON_INTR],
>      [test x"$OPUS_ARM_NEON_INTR" = x"1"])
> +AM_CONDITIONAL([HAVE_ARM_NE10],
> +    [test x"$HAVE_ARM_NE10" = x"1"])
> +
>
>  AS_IF([test x"$enable_rtcd" = x"yes"],[
>      AS_IF([test x"$rtcd_support" != x"no"],[
> diff --git a/src/analysis.c b/src/analysis.c
> index 2ee8533..0603643 100644
> --- a/src/analysis.c
> +++ b/src/analysis.c
> @@ -262,7 +262,7 @@ static void tonality_analysis(TonalityAnalysisState *tonal, const CELTMode *celt
>      remaining = len - (ANALYSIS_BUF_SIZE-tonal->mem_fill);
>      downmix(x, &tonal->inmem[240], remaining, offset+ANALYSIS_BUF_SIZE-tonal->mem_fill, c1, c2, C);
>      tonal->mem_fill = 240 + remaining;
> -    opus_fft(kfft, in, out);
> +    opus_fft(kfft, in, out, opus_select_arch());
>  #ifndef FIXED_POINT
>      /* If there's any NaN on the input, the entire output will be NaN, so we only need to check one value. */
>      if (celt_isnan(out[0].r))
> diff --git a/src/opus_multistream_encoder.c b/src/opus_multistream_encoder.c
> index 6e87337..05744bf 100644
> --- a/src/opus_multistream_encoder.c
> +++ b/src/opus_multistream_encoder.c
> @@ -257,7 +257,8 @@ void surround_analysis(const CELTMode *celt_mode, const void *pcm, opus_val16 *b
>        OPUS_COPY(in, mem+c*overlap, overlap);
>        (*copy_channel_in)(x, 1, pcm, channels, c, len);
>        celt_preemphasis(x, in+overlap, frame_size, 1, upsample, celt_mode->preemph, preemph_mem+c, 0);
> -      clt_mdct_forward(&celt_mode->mdct, in, freq, celt_mode->window, overlap, celt_mode->maxLM-LM, 1);
> +      clt_mdct_forward(&celt_mode->mdct, in, freq, celt_mode->window,
> +                        overlap, celt_mode->maxLM-LM, 1, opus_select_arch());
>        if (upsample != 1)
>        {
>           int bound = len;
> --
> 1.7.9.5
>


More information about the opus mailing list