[opus] AVX Optimizations
Timothy B. Terriberry
tterribe at xiph.org
Thu Nov 5 01:08:14 PST 2015
Velea, Radu wrote:
> Yes,
>
> Thank you. I'll follow up with the AVX code and tests for pitch code.
Actually, I lied. Because you update opus_select_arch(), you can now
return a value for arch (4) that is larger than the maximum we currently
support (3). This doesn't actually cause failures, because we mask with
OPUS_ARCHMASK, but since 4 & 3 == 0 it does mean that a CPU with AVX will
invoke the C versions of every function instead of any potential SSE,
SSE2, or SSE4.1 versions. It only works this well because the current
maximum arch index happens to be one less than a power of two.
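(To make that concrete, here's a small standalone sketch -- illustration
only, not code from the tree or from the patch -- of how the RTCD tables
get indexed: each table has OPUS_ARCHMASK + 1 entries and is indexed with
arch & OPUS_ARCHMASK, so an arch value past the end of the table silently
aliases back to a lower entry.)

/* Minimal sketch (not from the opus sources) showing why arch == 4 falls
 * back to the plain C entry under the old mask. */
#include <stdio.h>

#define OPUS_ARCHMASK 3   /* old mask: valid indices 0..3 */

static const char *const IMPL[OPUS_ARCHMASK + 1] = {
  "C", "SSE", "SSE2", "SSE4.1"
};

int main(void)
{
  int arch = 4;  /* what opus_select_arch() would return on an AVX CPU */
  /* 4 & 3 == 0, so the lookup lands on the plain C entry. */
  printf("arch %d -> %s implementation\n", arch, IMPL[arch & OPUS_ARCHMASK]);
  return 0;
}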
Here's a patch that fixes the problem. If this looks okay to you, I'll
amend your second commit to include these changes.
diff --git a/celt/cpu_support.h b/celt/cpu_support.h
index db1cb58..133abbf 100644
--- a/celt/cpu_support.h
+++ b/celt/cpu_support.h
@@ -43,23 +43,24 @@
*/
#define OPUS_ARCHMASK 3
#elif (defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)) || \
 (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2)) || \
 (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1))
#include "x86/x86cpu.h"
-/* We currently support 4 x86 variants:
+/* We currently support 5 x86 variants:
* arch[0] -> non-sse
* arch[1] -> sse
* arch[2] -> sse2
* arch[3] -> sse4.1
+ * arch[4] -> avx
*/
-#define OPUS_ARCHMASK 3
+#define OPUS_ARCHMASK 7
int opus_select_arch(void);
#else
#define OPUS_ARCHMASK 0
static OPUS_INLINE int opus_select_arch(void)
{
return 0;
diff --git a/celt/x86/x86_celt_map.c b/celt/x86/x86_celt_map.c
index 1ed2acb..8e5e449 100644
--- a/celt/x86/x86_celt_map.c
+++ b/celt/x86/x86_celt_map.c
@@ -48,44 +48,47 @@ void (*const CELT_FIR_IMPL[OPUS_ARCHMASK + 1])(
int ord,
opus_val16 *mem,
int arch
) = {
celt_fir_c, /* non-sse */
celt_fir_c,
celt_fir_c,
MAY_HAVE_SSE4_1(celt_fir), /* sse4.1 */
+ MAY_HAVE_SSE4_1(celt_fir) /* avx */
};
void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
opus_val32 sum[4],
int len
) = {
xcorr_kernel_c, /* non-sse */
xcorr_kernel_c,
xcorr_kernel_c,
MAY_HAVE_SSE4_1(xcorr_kernel), /* sse4.1 */
+ MAY_HAVE_SSE4_1(xcorr_kernel) /* avx */
};
#endif
#if (defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_PRESUME_SSE4_1)) || \
 (!defined(OPUS_X86_MAY_HAVE_SSE_4_1) && defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_SSE2))
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
int N
) = {
celt_inner_prod_c, /* non-sse */
celt_inner_prod_c,
MAY_HAVE_SSE2(celt_inner_prod),
MAY_HAVE_SSE4_1(celt_inner_prod), /* sse4.1 */
+ MAY_HAVE_SSE4_1(celt_inner_prod) /* avx */
};
#endif
# else
#if defined(OPUS_X86_MAY_HAVE_SSE) && !defined(OPUS_X86_PRESUME_SSE)
@@ -94,55 +97,59 @@ void (*const XCORR_KERNEL_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *y,
opus_val32 sum[4],
int len
) = {
xcorr_kernel_c, /* non-sse */
MAY_HAVE_SSE(xcorr_kernel),
MAY_HAVE_SSE(xcorr_kernel),
MAY_HAVE_SSE(xcorr_kernel),
+ MAY_HAVE_SSE(xcorr_kernel)
};
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y,
int N
) = {
celt_inner_prod_c, /* non-sse */
MAY_HAVE_SSE(celt_inner_prod),
MAY_HAVE_SSE(celt_inner_prod),
MAY_HAVE_SSE(celt_inner_prod),
+ MAY_HAVE_SSE(celt_inner_prod)
};
void (*const DUAL_INNER_PROD_IMPL[OPUS_ARCHMASK + 1])(
const opus_val16 *x,
const opus_val16 *y01,
const opus_val16 *y02,
int N,
opus_val32 *xy1,
opus_val32 *xy2
) = {
dual_inner_prod_c, /* non-sse */
MAY_HAVE_SSE(dual_inner_prod),
MAY_HAVE_SSE(dual_inner_prod),
MAY_HAVE_SSE(dual_inner_prod),
+ MAY_HAVE_SSE(dual_inner_prod)
};
void (*const COMB_FILTER_CONST_IMPL[OPUS_ARCHMASK + 1])(
opus_val32 *y,
opus_val32 *x,
int T,
int N,
opus_val16 g10,
opus_val16 g11,
opus_val16 g12
) = {
comb_filter_const_c, /* non-sse */
MAY_HAVE_SSE(comb_filter_const),
MAY_HAVE_SSE(comb_filter_const),
MAY_HAVE_SSE(comb_filter_const),
+ MAY_HAVE_SSE(comb_filter_const)
};
#endif
#endif
#endif
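For context (this paraphrases the existing RTCD glue in
celt/x86/pitch_sse.h rather than quoting it verbatim), callers reach these
tables through macros along these lines:

/* Rough sketch of the dispatch pattern used when the implementation is
 * not presumed at compile time; the exact macro text may differ. */
#define xcorr_kernel(x, y, sum, len, arch) \
  ((*XCORR_KERNEL_IMPL[(arch) & OPUS_ARCHMASK])(x, y, sum, len))

That's why every table needs OPUS_ARCHMASK + 1 entries, why the mask has to
round up to the next power of two minus one (4 & 7 == 4, whereas
4 & 3 == 0), and why the new arch[4] slots just reuse the SSE4.1 (or SSE)
kernels until dedicated AVX versions exist.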