[opus] AVX Optimizations

Timothy B. Terriberry tterribe at xiph.org
Thu Nov 5 01:08:14 PST 2015


Velea, Radu wrote:
> Yes,
>
> Thank you. I'll follow up with the AVX code and tests for pitch code.

Actually, I lied. Because you updated opus_select_arch(), it can now 
return a value for arch (4) that is larger than the maximum we currently 
support (3). This doesn't actually cause failures, because we mask with 
OPUS_ARCHMASK (and 4 & 3 == 0), but it does mean that a CPU with AVX will 
invoke the C versions of every function instead of any potential SSE, 
SSE2, or SSE4.1 versions. It only works this well because the current 
maximum arch index happens to be one less than a power of two, so the 
masked value always stays within the bounds of the function tables.

Here's a patch that fixes the problem. If this looks okay to you, I'll 
amend your second commit to include these changes.

diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index db1cb58..133abbf 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -43,23 +43,24 @@
   */
  #define OPUS_ARCHMASK 3

  #elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
    (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
    (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1))

  #include "x86/x86cpu.h"
-/* We currently support 4 x86 variants:
+/* We currently support 5 x86 variants:
   * arch[0] -> non-sse
   * arch[1] -> sse
   * arch[2] -> sse2
   * arch[3] -> sse4.1
+ * arch[4] -> avx
   */
-#define OPUS_ARCHMASK 3
+#define OPUS_ARCHMASK 7
  int opus_select_arch(void);

  #else
  #define OPUS_ARCHMASK 0

  static OPUS_INLINE int opus_select_arch(void)
  {
    return 0;
diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
index 1ed2acb..8e5e449 100644
--- a/celt/x86/x86_celt_map.c
+++ b/celt/x86/x86_celt_map.c
@@ -48,44 +48,47 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
           int              ord,
           opus_val16       *mem,
           int              arch
  ) = {
    celt_fir_c,                /* non-sse */
    celt_fir_c,
    celt_fir_c,
    MAY_HAVE_SSE4_1(celt_fir), /* sse4.1  */
+  MAY_HAVE_SSE4_1(celt_fir)  /* avx  */
  };

  void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
           const opus_val16 *x,
           const opus_val16 *y,
           opus_val32       sum[4],
           int              len
  ) = {
    xcorr_kernel_c,                /* non-sse */
    xcorr_kernel_c,
    xcorr_kernel_c,
    MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1  */
+  MAY_HAVE_SSE4_1(xcorr_kernel)  /* avx  */
  };

  #endif

  #if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) ||  \
  	(!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))

  opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
           const opus_val16 *x,
           const opus_val16 *y,
           int              N
  ) = {
    celt_inner_prod_c,                /* non-sse */
    celt_inner_prod_c,
    MAY_HAVE_SSE2(celt_inner_prod),
    MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1  */
+  MAY_HAVE_SSE4_1(celt_inner_prod)  /* avx  */
  };

  #endif

  # else

  #if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)

@@ -94,55 +97,59 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
           const opus_val16 *y,
           opus_val32       sum[4],
           int              len
  ) = {
    xcorr_kernel_c,                /* non-sse */
    MAY_HAVE_SSE(xcorr_kernel),
    MAY_HAVE_SSE(xcorr_kernel),
    MAY_HAVE_SSE(xcorr_kernel),
+  MAY_HAVE_SSE(xcorr_kernel)
  };

  opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
           const opus_val16 *x,
           const opus_val16 *y,
           int              N
  ) = {
    celt_inner_prod_c,                /* non-sse */
    MAY_HAVE_SSE(celt_inner_prod),
    MAY_HAVE_SSE(celt_inner_prod),
    MAY_HAVE_SSE(celt_inner_prod),
+  MAY_HAVE_SSE(celt_inner_prod)
  };

  void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
                      const opus_val16 *x,
                      const opus_val16 *y01,
                      const opus_val16 *y02,
                      int               N,
                      opus_val32       *xy1,
                      opus_val32       *xy2
  ) = {
    dual_inner_prod_c,                /* non-sse */
    MAY_HAVE_SSE(dual_inner_prod),
    MAY_HAVE_SSE(dual_inner_prod),
    MAY_HAVE_SSE(dual_inner_prod),
+  MAY_HAVE_SSE(dual_inner_prod)
  };

  void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
                opus_val32 *y,
                opus_val32 *x,
                int         T,
                int         N,
                opus_val16  g10,
                opus_val16  g11,
                opus_val16  g12
  ) = {
    comb_filter_const_c,                /* non-sse */
    MAY_HAVE_SSE(comb_filter_const),
    MAY_HAVE_SSE(comb_filter_const),
    MAY_HAVE_SSE(comb_filter_const),
+  MAY_HAVE_SSE(comb_filter_const)
  };


  #endif

  #endif
  #endif


More information about the opus mailing list