[Vorbis-dev] [RFC PATCH v1 3/3] mdct: implement arm simd implementation for mdct
Viswanath Puttagunta
viswanath.puttagunta at linaro.org
Wed Sep 10 12:15:09 PDT 2014
Signed-off-by: Viswanath Puttagunta <viswanath.puttagunta at linaro.org>
---
lib/mdct.c | 102 +---------------------------------------------
lib/simd/neon_simd.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++
lib/simd/no_simd.c | 101 +++++++++++++++++++++++++++++++++++++++++++++
lib/simd/simd.h | 11 +++++
4 files changed, 224 insertions(+), 101 deletions(-)
diff --git a/lib/mdct.c b/lib/mdct.c
index fbc7cf0..633fc49 100644
--- a/lib/mdct.c
+++ b/lib/mdct.c
@@ -45,6 +45,7 @@
#include "mdct.h"
#include "os.h"
#include "misc.h"
+#include "simd.h"
/* build lookups for trig functions; also pre-figure scaling and
some window function algebra. */
@@ -213,107 +214,6 @@ STIN void mdct_butterfly_32(DATA_TYPE *x){
}
-/* N point first stage butterfly (in place, 2 register) */
-STIN void mdct_butterfly_first(DATA_TYPE *T,
- DATA_TYPE *x,
- int points){
-
- DATA_TYPE *x1 = x + points - 8;
- DATA_TYPE *x2 = x + (points>>1) - 8;
- REG_TYPE r0;
- REG_TYPE r1;
-
- do{
-
- r0 = x1[6] - x2[6];
- r1 = x1[7] - x2[7];
- x1[6] += x2[6];
- x1[7] += x2[7];
- x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]);
- x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]);
-
- r0 = x1[4] - x2[4];
- r1 = x1[5] - x2[5];
- x1[4] += x2[4];
- x1[5] += x2[5];
- x2[4] = MULT_NORM(r1 * T[5] + r0 * T[4]);
- x2[5] = MULT_NORM(r1 * T[4] - r0 * T[5]);
-
- r0 = x1[2] - x2[2];
- r1 = x1[3] - x2[3];
- x1[2] += x2[2];
- x1[3] += x2[3];
- x2[2] = MULT_NORM(r1 * T[9] + r0 * T[8]);
- x2[3] = MULT_NORM(r1 * T[8] - r0 * T[9]);
-
- r0 = x1[0] - x2[0];
- r1 = x1[1] - x2[1];
- x1[0] += x2[0];
- x1[1] += x2[1];
- x2[0] = MULT_NORM(r1 * T[13] + r0 * T[12]);
- x2[1] = MULT_NORM(r1 * T[12] - r0 * T[13]);
-
- x1-=8;
- x2-=8;
- T+=16;
-
- }while(x2>=x);
-}
-
-/* N/stage point generic N stage butterfly (in place, 2 register) */
-STIN void mdct_butterfly_generic(DATA_TYPE *T,
- DATA_TYPE *x,
- int points,
- int trigint){
-
- DATA_TYPE *x1 = x + points - 8;
- DATA_TYPE *x2 = x + (points>>1) - 8;
- REG_TYPE r0;
- REG_TYPE r1;
-
- do{
-
- r0 = x1[6] - x2[6];
- r1 = x1[7] - x2[7];
- x1[6] += x2[6];
- x1[7] += x2[7];
- x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]);
- x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]);
-
- T+=trigint;
-
- r0 = x1[4] - x2[4];
- r1 = x1[5] - x2[5];
- x1[4] += x2[4];
- x1[5] += x2[5];
- x2[4] = MULT_NORM(r1 * T[1] + r0 * T[0]);
- x2[5] = MULT_NORM(r1 * T[0] - r0 * T[1]);
-
- T+=trigint;
-
- r0 = x1[2] - x2[2];
- r1 = x1[3] - x2[3];
- x1[2] += x2[2];
- x1[3] += x2[3];
- x2[2] = MULT_NORM(r1 * T[1] + r0 * T[0]);
- x2[3] = MULT_NORM(r1 * T[0] - r0 * T[1]);
-
- T+=trigint;
-
- r0 = x1[0] - x2[0];
- r1 = x1[1] - x2[1];
- x1[0] += x2[0];
- x1[1] += x2[1];
- x2[0] = MULT_NORM(r1 * T[1] + r0 * T[0]);
- x2[1] = MULT_NORM(r1 * T[0] - r0 * T[1]);
-
- T+=trigint;
- x1-=8;
- x2-=8;
-
- }while(x2>=x);
-}
-
STIN void mdct_butterflies(mdct_lookup *init,
DATA_TYPE *x,
int points){
diff --git a/lib/simd/neon_simd.c b/lib/simd/neon_simd.c
index 381d704..ab55205 100644
--- a/lib/simd/neon_simd.c
+++ b/lib/simd/neon_simd.c
@@ -16,6 +16,45 @@
#include <arm_neon.h>
#include "simd.h"
+#ifdef MDCT_INTEGERIZED
+
+#define DATA_TYPE32x4x2_t int32x4x2_t
+#define DATA_TYPE32x4_t int32x4_t
+#define DATA_TYPE32x2_t int32x2_t
+#define ONES_MINUS_ONE 0x00000001FFFFFFFF // {1, -1}
+#define vcreate_DATA_TYPE(a) vcreate_i32(a)
+#define vcombine_DATA_TYPE(a,b) vcombine_i32(a,b)
+#define vld2q_DATA_TYPE(a) vld2q_i32(a)
+#define vsubq_DATA_TYPE(a,b) vsubq_i32(a,b)
+#define vaddq_DATA_TYPE(a,b) vaddq_i32(a,b)
+#define vst2q_DATA_TYPE(a,b) vst2q_i32(a,b)
+#define vld1_DATA_TYPE(a) vld1_i32(a)
+#define vrev64_DATA_TYPE(a) vrev64_i32(a)
+#define vmul_n_DATA_TYPE(a,b) vmul_n_i32(a,b)
+#define vmulq_DATA_TYPE(a,b) vmulq_i32(a,b)
+#define vst1q_DATA_TYPE(a,b) vst1q_i32(a,b)
+
+#else
+
+#define DATA_TYPE32x4x2_t float32x4x2_t
+#define DATA_TYPE32x4_t float32x4_t
+#define DATA_TYPE32x2_t float32x2_t
+#define ONES_MINUS_ONE 0xbf8000003f800000 //{-1.0, 1.0}
+#define vcreate_DATA_TYPE(a) vcreate_f32(a)
+#define vcombine_DATA_TYPE(a,b) vcombine_f32(a,b)
+#define vld2q_DATA_TYPE(a) vld2q_f32(a)
+#define vsubq_DATA_TYPE(a,b) vsubq_f32(a,b)
+#define vaddq_DATA_TYPE(a,b) vaddq_f32(a,b)
+#define vst2q_DATA_TYPE(a,b) vst2q_f32(a,b)
+#define vld1_DATA_TYPE(a) vld1_f32(a)
+#define vrev64_DATA_TYPE(a) vrev64_f32(a)
+#define vmul_n_DATA_TYPE(a,b) vmul_n_f32(a,b)
+#define vmulq_DATA_TYPE(a,b) vmulq_f32(a,b)
+#define vst1q_DATA_TYPE(a,b) vst1q_f32(a,b)
+#define vrev64q_DATA_TYPE(a,b) vrev64q_f32(a,b)
+
+#endif
+
/* wave_operation: Implements pseudo-code
* for(i = 0; i < n; i++)
* a[i] = a[i]*b[n-i-1] + a[i]*c[i]
@@ -45,3 +84,75 @@ void wave_operation(float *a, float *b, float *c, int32_t n) {
vst1q_f32(ai, result);
}
}
+
+void mdct_butterfly_generic(DATA_TYPE *T, DATA_TYPE *x,
+ int points, int trigint) {
+ DATA_TYPE *x1 = x + points - 8;
+ DATA_TYPE *x2 = x + (points>>1) - 8;
+ DATA_TYPE32x4x2_t k1, k2;
+ DATA_TYPE32x4_t r0, r1;
+ DATA_TYPE32x4_t TTr0[2], TTir1[2];
+ DATA_TYPE32x2_t Tr0[4], Tir1[4];
+ DATA_TYPE32x2_t dones = vcreate_DATA_TYPE(ONES_MINUS_ONE);
+ DATA_TYPE32x4_t ones = vcombine_DATA_TYPE(dones, dones);
+
+ do{
+ k1 = vld2q_DATA_TYPE(x1);
+ k2 = vld2q_DATA_TYPE(x2);
+
+ r0 = vsubq_DATA_TYPE(k1.val[0], k2.val[0]);
+ r1 = vsubq_DATA_TYPE(k1.val[1], k2.val[1]);
+
+ k1.val[0] = vaddq_DATA_TYPE(k1.val[0], k2.val[0]);
+ k1.val[1] = vaddq_DATA_TYPE(k1.val[1], k2.val[1]);
+ vst2q_DATA_TYPE(x1, k1);
+
+ Tr0[0] = vld1_DATA_TYPE(T);
+ T += trigint;
+
+ Tr0[1] = vld1_DATA_TYPE(T);
+ T += trigint;
+
+ Tr0[2] = vld1_DATA_TYPE(T);
+ T += trigint;
+
+ Tr0[3] = vld1_DATA_TYPE(T);
+ T += trigint;
+
+ Tir1[0] = vrev64_DATA_TYPE(Tr0[0]);
+ Tir1[1] = vrev64_DATA_TYPE(Tr0[1]);
+ Tir1[2] = vrev64_DATA_TYPE(Tr0[2]);
+ Tir1[3] = vrev64_DATA_TYPE(Tr0[3]);
+
+ Tr0[0] = vmul_n_DATA_TYPE(Tr0[0], r0[3]);
+ Tr0[1] = vmul_n_DATA_TYPE(Tr0[1], r0[2]);
+ Tr0[2] = vmul_n_DATA_TYPE(Tr0[2], r0[1]);
+ Tr0[3] = vmul_n_DATA_TYPE(Tr0[3], r0[0]);
+
+ Tir1[0] = vmul_n_DATA_TYPE(Tir1[0], r1[3]);
+ Tir1[1] = vmul_n_DATA_TYPE(Tir1[1], r1[2]);
+ Tir1[2] = vmul_n_DATA_TYPE(Tir1[2], r1[1]);
+ Tir1[3] = vmul_n_DATA_TYPE(Tir1[3], r1[0]);
+
+ TTr0[0] = vcombine_DATA_TYPE(Tr0[3], Tr0[2]);
+ TTr0[0] = vmulq_DATA_TYPE(TTr0[0], ones);
+ TTr0[1] = vcombine_DATA_TYPE(Tr0[1], Tr0[0]);
+ TTr0[1] = vmulq_DATA_TYPE(TTr0[1], ones);
+
+ TTir1[0] = vcombine_DATA_TYPE(Tir1[3], Tir1[2]);
+ TTir1[1] = vcombine_DATA_TYPE(Tir1[1], Tir1[0]);
+ k2.val[0] = vaddq_DATA_TYPE(TTr0[0], TTir1[0]);
+ k2.val[1] = vaddq_DATA_TYPE(TTr0[1], TTir1[1]);
+
+ vst1q_DATA_TYPE(x2, k2.val[0]);
+ vst1q_DATA_TYPE(x2+4, k2.val[1]);
+
+ x1 -= 8;
+ x2 -= 8;
+ }while(x2 >= x);
+}
+
+void mdct_butterfly_first(DATA_TYPE *T, DATA_TYPE *x, int points)
+{
+ mdct_butterfly_generic(T, x, points, 4);
+}
diff --git a/lib/simd/no_simd.c b/lib/simd/no_simd.c
index e8efacb..a2cc6c4 100644
--- a/lib/simd/no_simd.c
+++ b/lib/simd/no_simd.c
@@ -26,3 +26,104 @@ void wave_operation(float *a, float *b, float *c, int32_t n) {
for (i = 0; i < n; i++)
a[i] = a[i]*b[n-i-1] + b[i]*c[i];
}
+
+/* N/stage point generic N stage butterfly (in place, 2 register) */
+void mdct_butterfly_generic(DATA_TYPE *T,
+ DATA_TYPE *x,
+ int points,
+ int trigint){
+
+ DATA_TYPE *x1 = x + points - 8;
+ DATA_TYPE *x2 = x + (points>>1) - 8;
+ REG_TYPE r0;
+ REG_TYPE r1;
+
+ do{
+
+ r0 = x1[6] - x2[6];
+ r1 = x1[7] - x2[7];
+ x1[6] += x2[6];
+ x1[7] += x2[7];
+ x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]);
+
+ T+=trigint;
+
+ r0 = x1[4] - x2[4];
+ r1 = x1[5] - x2[5];
+ x1[4] += x2[4];
+ x1[5] += x2[5];
+ x2[4] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[5] = MULT_NORM(r1 * T[0] - r0 * T[1]);
+
+ T+=trigint;
+
+ r0 = x1[2] - x2[2];
+ r1 = x1[3] - x2[3];
+ x1[2] += x2[2];
+ x1[3] += x2[3];
+ x2[2] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[3] = MULT_NORM(r1 * T[0] - r0 * T[1]);
+
+ T+=trigint;
+
+ r0 = x1[0] - x2[0];
+ r1 = x1[1] - x2[1];
+ x1[0] += x2[0];
+ x1[1] += x2[1];
+ x2[0] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[1] = MULT_NORM(r1 * T[0] - r0 * T[1]);
+
+ T+=trigint;
+ x1-=8;
+ x2-=8;
+
+ }while(x2>=x);
+}
+
+/* N point first stage butterfly (in place, 2 register) */
+void mdct_butterfly_first(DATA_TYPE *T,
+ DATA_TYPE *x,
+ int points){
+
+ DATA_TYPE *x1 = x + points - 8;
+ DATA_TYPE *x2 = x + (points>>1) - 8;
+ REG_TYPE r0;
+ REG_TYPE r1;
+
+ do{
+
+ r0 = x1[6] - x2[6];
+ r1 = x1[7] - x2[7];
+ x1[6] += x2[6];
+ x1[7] += x2[7];
+ x2[6] = MULT_NORM(r1 * T[1] + r0 * T[0]);
+ x2[7] = MULT_NORM(r1 * T[0] - r0 * T[1]);
+
+ r0 = x1[4] - x2[4];
+ r1 = x1[5] - x2[5];
+ x1[4] += x2[4];
+ x1[5] += x2[5];
+ x2[4] = MULT_NORM(r1 * T[5] + r0 * T[4]);
+ x2[5] = MULT_NORM(r1 * T[4] - r0 * T[5]);
+
+ r0 = x1[2] - x2[2];
+ r1 = x1[3] - x2[3];
+ x1[2] += x2[2];
+ x1[3] += x2[3];
+ x2[2] = MULT_NORM(r1 * T[9] + r0 * T[8]);
+ x2[3] = MULT_NORM(r1 * T[8] - r0 * T[9]);
+
+ r0 = x1[0] - x2[0];
+ r1 = x1[1] - x2[1];
+ x1[0] += x2[0];
+ x1[1] += x2[1];
+ x2[0] = MULT_NORM(r1 * T[13] + r0 * T[12]);
+ x2[1] = MULT_NORM(r1 * T[12] - r0 * T[13]);
+
+ x1-=8;
+ x2-=8;
+ T+=16;
+
+ }while(x2>=x);
+}
diff --git a/lib/simd/simd.h b/lib/simd/simd.h
index 8565434..a9e6ccf 100644
--- a/lib/simd/simd.h
+++ b/lib/simd/simd.h
@@ -18,6 +18,7 @@
#define SIMD_H
#include "os.h"
+#include "mdct.h"
/* wave_operation: Must implement pseudo-code
* for(i = 0; i < n; i++)
@@ -26,4 +27,14 @@
*/
void wave_operation(float *a, float *b, float *c, int32_t n);
+/* N/stage point generic N stage butterfly (in place, 2 register) */
+void mdct_butterfly_generic(DATA_TYPE *T,
+ DATA_TYPE *x,
+ int points,
+ int trigint);
+
+/* N point first stage butterfly (in place, 2 register) */
+void mdct_butterfly_first(DATA_TYPE *T,
+ DATA_TYPE *x,
+ int points);
#endif
--
1.7.9.5
More information about the Vorbis-dev
mailing list