[opus] [RFC PATCH v1 5/8] aarch64: celt_pitch_xcorr: Fixed point intrinsics
Viswanath Puttagunta
viswanath.puttagunta at linaro.org
Tue Apr 28 15:24:53 PDT 2015
Optimize celt_pitch_xcorr function (for fixed point).
Even though the same code should, in theory, work for ARMv7 as well, it is
enabled only for aarch64 for now, since ARMv7 already has a fixed-point
NEON assembly implementation of this function.
Signed-off-by: Viswanath Puttagunta <viswanath.puttagunta at linaro.org>
---
celt/arm/celt_neon_intr.c | 268 ++++++++++++++++++++++++++++++++++++++++++++++
celt/arm/pitch_arm.h | 10 ++
configure.ac | 6 ++
3 files changed, 284 insertions(+)
diff --git a/celt/arm/celt_neon_intr.c b/celt/arm/celt_neon_intr.c
index 47dce15..be978a0 100644
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@@ -249,4 +249,272 @@ void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
(const float32_t *)_y+i, (float32_t *)xcorr+i, len);
}
}
+#else /* FIXED POINT */
+
/*
 * Function: xcorr_kernel_neon_fixed
 * ---------------------------------
 * Computes the correlation of x against y at four consecutive lags and
 * stores the four results in sum[0..3]:
 *    sum[k] = sum_{j=0..len-1} x[j] * y[j + k],  k = 0..3
 * Requires len > 4.  Accesses x[0..len-1]; the y loads may touch up to
 * y[len+3] (see the per-block comments below).
 * NOTE(review): callers (celt_pitch_xcorr) must provide enough valid y
 * padding past y+len-1 for those loads -- verify against the caller.
 */
static void xcorr_kernel_neon_fixed(const int16_t *x, const int16_t *y,
                                    int32_t sum[4], int len) {
   int16x8_t YY[3];   /* sliding window of y values                     */
   int16x4_t YEXT[3]; /* y window shifted left by 1, 2 and 3 elements   */
   int16x8_t XX[2];
   int16x4_t XX_2, YY_2;
   int32x4_t SUMM;    /* the four lag accumulators                      */
   const int16_t *xi = x;
   /* Invariant maintained below: yi == y + (number of x values consumed)
    * and YY[0] holds y[consumed .. consumed+7]. */
   const int16_t *yi = y;

   celt_assert(len>4);

   YY[0] = vld1q_s16(yi);

   SUMM = vdupq_n_s32(0);

   /* Consume 16 elements of x per iteration.  Relative to the current
    * position, y[19] and beyond are not *used* this round (the extra
    * loaded values are carried into the next round via YY[0] = YY[2]),
    * so only iterate while len > 19.
    */
   while (len > 19) {
      yi += 8;
      YY[1] = vld1q_s16(yi);
      yi += 8;
      YY[2] = vld1q_s16(yi);

      XX[0] = vld1q_s16(xi);
      xi += 8;
      XX[1] = vld1q_s16(xi);
      xi += 8;

      /* Consume XX[0][0:3] */
      SUMM = vmlal_lane_s16(SUMM, vget_low_s16(YY[0]), vget_low_s16(XX[0]), 0);

      YEXT[0] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 1);
      SUMM = vmlal_lane_s16(SUMM, YEXT[0], vget_low_s16(XX[0]), 1);

      YEXT[1] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 2);
      SUMM = vmlal_lane_s16(SUMM, YEXT[1], vget_low_s16(XX[0]), 2);

      YEXT[2] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 3);
      SUMM = vmlal_lane_s16(SUMM, YEXT[2], vget_low_s16(XX[0]), 3);

      /* Consume XX[0][7:4] */
      SUMM = vmlal_lane_s16(SUMM, vget_high_s16(YY[0]), vget_high_s16(XX[0]), 0);

      YEXT[0] = vext_s16(vget_high_s16(YY[0]), vget_low_s16(YY[1]), 1);
      SUMM = vmlal_lane_s16(SUMM, YEXT[0], vget_high_s16(XX[0]), 1);

      YEXT[1] = vext_s16(vget_high_s16(YY[0]), vget_low_s16(YY[1]), 2);
      SUMM = vmlal_lane_s16(SUMM, YEXT[1], vget_high_s16(XX[0]), 2);

      YEXT[2] = vext_s16(vget_high_s16(YY[0]), vget_low_s16(YY[1]), 3);
      SUMM = vmlal_lane_s16(SUMM, YEXT[2], vget_high_s16(XX[0]), 3);

      /* Consume XX[1][3:0] */
      SUMM = vmlal_lane_s16(SUMM, vget_low_s16(YY[1]), vget_low_s16(XX[1]), 0);

      YEXT[0] = vext_s16(vget_low_s16(YY[1]), vget_high_s16(YY[1]), 1);
      SUMM = vmlal_lane_s16(SUMM, YEXT[0], vget_low_s16(XX[1]), 1);

      YEXT[1] = vext_s16(vget_low_s16(YY[1]), vget_high_s16(YY[1]), 2);
      SUMM = vmlal_lane_s16(SUMM, YEXT[1], vget_low_s16(XX[1]), 2);

      YEXT[2] = vext_s16(vget_low_s16(YY[1]), vget_high_s16(YY[1]), 3);
      SUMM = vmlal_lane_s16(SUMM, YEXT[2], vget_low_s16(XX[1]), 3);

      /* Consume XX[1][7:4] */
      SUMM = vmlal_lane_s16(SUMM, vget_high_s16(YY[1]), vget_high_s16(XX[1]), 0);

      YEXT[0] = vext_s16(vget_high_s16(YY[1]), vget_low_s16(YY[2]), 1);
      SUMM = vmlal_lane_s16(SUMM, YEXT[0], vget_high_s16(XX[1]), 1);

      YEXT[1] = vext_s16(vget_high_s16(YY[1]), vget_low_s16(YY[2]), 2);
      SUMM = vmlal_lane_s16(SUMM, YEXT[1], vget_high_s16(XX[1]), 2);

      YEXT[2] = vext_s16(vget_high_s16(YY[1]), vget_low_s16(YY[2]), 3);
      SUMM = vmlal_lane_s16(SUMM, YEXT[2], vget_high_s16(XX[1]), 3);

      YY[0] = YY[2];
      len -= 16;
   }

   /* Consume 8 elements of x.  Relative to the current position,
    * y[15:11] should not be needed unless len > 11.
    */
   if (len > 11) {
      yi += 8;
      YY[1] = vld1q_s16(yi);

      XX[0] = vld1q_s16(xi);
      xi += 8;

      /* Consume XX[0][0:3] */
      SUMM = vmlal_lane_s16(SUMM, vget_low_s16(YY[0]), vget_low_s16(XX[0]), 0);

      YEXT[0] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 1);
      SUMM = vmlal_lane_s16(SUMM, YEXT[0], vget_low_s16(XX[0]), 1);

      YEXT[1] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 2);
      SUMM = vmlal_lane_s16(SUMM, YEXT[1], vget_low_s16(XX[0]), 2);

      YEXT[2] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 3);
      SUMM = vmlal_lane_s16(SUMM, YEXT[2], vget_low_s16(XX[0]), 3);

      /* Consume XX[0][7:4] */
      SUMM = vmlal_lane_s16(SUMM, vget_high_s16(YY[0]), vget_high_s16(XX[0]), 0);

      YEXT[0] = vext_s16(vget_high_s16(YY[0]), vget_low_s16(YY[1]), 1);
      SUMM = vmlal_lane_s16(SUMM, YEXT[0], vget_high_s16(XX[0]), 1);

      YEXT[1] = vext_s16(vget_high_s16(YY[0]), vget_low_s16(YY[1]), 2);
      SUMM = vmlal_lane_s16(SUMM, YEXT[1], vget_high_s16(XX[0]), 2);

      YEXT[2] = vext_s16(vget_high_s16(YY[0]), vget_low_s16(YY[1]), 3);
      SUMM = vmlal_lane_s16(SUMM, YEXT[2], vget_high_s16(XX[0]), 3);

      YY[0] = YY[1];
      len -= 8;
   }

   /* Consume 4 elements of x using only the already-loaded YY[0]. */
   if (len > 4) {
      XX_2 = vld1_s16(xi);
      xi += 4;
      /* Consume XX_2[0:3] */
      SUMM = vmlal_lane_s16(SUMM, vget_low_s16(YY[0]), XX_2, 0);

      YEXT[0] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 1);
      SUMM = vmlal_lane_s16(SUMM, YEXT[0], XX_2, 1);

      YEXT[1] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 2);
      SUMM = vmlal_lane_s16(SUMM, YEXT[1], XX_2, 2);

      YEXT[2] = vext_s16(vget_low_s16(YY[0]), vget_high_s16(YY[0]), 3);
      SUMM = vmlal_lane_s16(SUMM, YEXT[2], XX_2, 3);

      YY_2 = vget_high_s16(YY[0]);
      /* Bug fix: keep yi in sync with the number of x values consumed,
       * so the reloads of YY_2 in the tail loop below read the correct
       * y positions. */
      yi += 4;
      len -= 4;
   } else {
      /* Bug fix: YY[0] may have been advanced by the blocks above, so a
       * YY_2 captured before them would be stale.  Reload the next four
       * unconsumed y values here instead. */
      YY_2 = vget_low_s16(YY[0]);
   }

   /* Scalar tail: one x value per iteration.  On entry YY_2 holds
    * y[c..c+3] and yi == y + c, where c values have been consumed. */
   while (--len > 0) {
      XX_2 = vld1_dup_s16(xi++);
      SUMM = vmlal_lane_s16(SUMM, YY_2, XX_2, 0);
      YY_2 = vld1_s16(++yi);
   }

   XX_2 = vld1_dup_s16(xi);
   SUMM = vmlal_lane_s16(SUMM, YY_2, XX_2, 0);

   vst1q_s32(sum, SUMM);
}
+
/*
 * Function: xcorr_kernel_neon_fixed_process1
 * ------------------------------------------
 * Computes a single correlation value -- the dot product of x and y
 * over len elements -- and stores it in *sum.
 */
static void xcorr_kernel_neon_fixed_process1(const int16_t *x,
                                             const int16_t *y,
                                             int32_t *sum, int len) {
   const int16_t *px = x;
   const int16_t *py = y;
   int16x8_t vx0, vx1;
   int16x8_t vy0, vy1;
   int16x4_t hx, hy;
   int32x4_t acc = vdupq_n_s32(0);
   int32x2_t folded;

   /* Main loop: 16 products per iteration. */
   for (; len >= 16; len -= 16) {
      vx0 = vld1q_s16(px);
      vx1 = vld1q_s16(px + 8);
      px += 16;

      vy0 = vld1q_s16(py);
      vy1 = vld1q_s16(py + 8);
      py += 16;

      acc = vmlal_s16(acc, vget_low_s16(vy0), vget_low_s16(vx0));
      acc = vmlal_s16(acc, vget_high_s16(vy0), vget_high_s16(vx0));
      acc = vmlal_s16(acc, vget_low_s16(vy1), vget_low_s16(vx1));
      acc = vmlal_s16(acc, vget_high_s16(vy1), vget_high_s16(vx1));
   }

   /* One group of 8, if present. */
   if (len >= 8) {
      vx0 = vld1q_s16(px);
      px += 8;
      vy0 = vld1q_s16(py);
      py += 8;

      acc = vmlal_s16(acc, vget_low_s16(vy0), vget_low_s16(vx0));
      acc = vmlal_s16(acc, vget_high_s16(vy0), vget_high_s16(vx0));
      len -= 8;
   }

   /* One group of 4, if present. */
   if (len >= 4) {
      hx = vld1_s16(px);
      px += 4;
      hy = vld1_s16(py);
      py += 4;
      acc = vmlal_s16(acc, hy, hx);
      len -= 4;
   }

   /* Fold the four partial sums so every lane holds the total. */
   folded = vadd_s32(vget_high_s32(acc), vget_low_s32(acc));
   folded = vpadd_s32(folded, folded);
   acc = vcombine_s32(folded, folded);

   /* Up to three leftover products, one at a time. */
   for (; len > 0; len--) {
      hx = vld1_dup_s16(px++);
      hy = vld1_dup_s16(py++);
      acc = vmlal_s16(acc, hx, hy);
   }

   vst1q_lane_s32(sum, acc, 0);
}
+
/*
 * Function: celt_pitch_xcorr_fixed_neon
 * -------------------------------------
 * Computes the cross-correlation of x with y for lags 0..max_pitch-1,
 * writing one opus_val32 result per lag into xcorr, and returns the
 * largest correlation value found (at least 1, matching the reference
 * celt_pitch_xcorr_c).
 */
opus_val32 celt_pitch_xcorr_fixed_neon(const opus_val16 *_x, const opus_val16 *_y,
                                       opus_val32 *xcorr, int len, int max_pitch) {
   int i;
   /* Declared at the top (C90-compatible) and typed opus_val32 to match
    * the return type, instead of a mid-block `int`. */
   opus_val32 max_corr = 1;

   celt_assert(max_pitch > 0);
   /* The NEON loads require _x to be 4-byte aligned. */
   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

   /* Compute four correlation values (four lags) per iteration. */
   for (i = 0; i < (max_pitch-3); i += 4) {
      xcorr_kernel_neon_fixed((const int16_t *)_x, (const int16_t *)_y+i,
                              (int32_t *)xcorr+i, len);
   }

   /* In case max_pitch isn't a multiple of 4, compute a single
    * correlation value per iteration for the remaining lags. */
   for (; i < max_pitch; i++) {
      xcorr_kernel_neon_fixed_process1((const int16_t *)_x,
                                       (const int16_t *)_y+i,
                                       (int32_t *)xcorr+i, len);
   }

   /* Bug fix: xcorr holds max_pitch results (one per lag), so scan
    * max_pitch entries -- scanning `len` entries read uninitialized
    * memory when len > max_pitch and missed values when len < max_pitch. */
   for (i = 0; i < max_pitch; i++) {
      max_corr = (max_corr > xcorr[i]) ? max_corr : xcorr[i];
   }
   return max_corr;
}
#endif
diff --git a/celt/arm/pitch_arm.h b/celt/arm/pitch_arm.h
index 344186b..d5c9408 100644
--- a/celt/arm/pitch_arm.h
+++ b/celt/arm/pitch_arm.h
@@ -32,6 +32,15 @@
# if defined(FIXED_POINT)
+#if defined(CPU_AARCH64)
+#define OVERRIDE_PITCH_XCORR (1)
+opus_val32 celt_pitch_xcorr_fixed_neon(const opus_val16 *_x, const opus_val16 *_y,
+ opus_val32 *xcorr, int len, int max_pitch);
+#define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
+ ((void)(arch), celt_pitch_xcorr_fixed_neon(_x, _y, xcorr, len, max_pitch))
+
+#else /* End CPU_AARCH64. Begin CPU_ARM */
+
# if defined(OPUS_ARM_MAY_HAVE_NEON)
opus_val32 celt_pitch_xcorr_neon(const opus_val16 *_x, const opus_val16 *_y,
opus_val32 *xcorr, int len, int max_pitch);
@@ -51,6 +60,7 @@ opus_val32 celt_pitch_xcorr_edsp(const opus_val16 *_x, const opus_val16 *_y,
# define celt_pitch_xcorr(_x, _y, xcorr, len, max_pitch, arch) \
((void)(arch),PRESUME_NEON(celt_pitch_xcorr)(_x, _y, xcorr, len, max_pitch))
# endif
+#endif /* End CPU_ARM */
#else /* Start !FIXED_POINT */
/* Float case */
diff --git a/configure.ac b/configure.ac
index a150d87..744c9b4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -473,6 +473,11 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
AC_DEFINE([OPUS_ARM_MAY_HAVE_NEON], 1, [Define if compiler support NEON instructions])
]
)
+ ],
+ [aarch64],
+ [
+ cpu_aarch64=yes
+ AC_DEFINE([CPU_AARCH64], 1, [Compiling for Aarch64])
]
)
@@ -658,6 +663,7 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[
])
AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"])
+AM_CONDITIONAL([CPU_AARCH64], [test "$cpu_aarch64" = "yes"])
AM_CONDITIONAL([HAVE_ARM_NEON_INTR],
[test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"])
AM_CONDITIONAL([HAVE_ARM_NE10],
--
1.9.1
More information about the opus
mailing list