[Speex-dev] [PATCH] Fix miscompile of SSE resampler
Thorvald Natvig
thorvald at natvig.com
Mon Oct 26 09:41:59 PDT 2009
From: Thorvald Natvig <slicer at users.sourceforge.net>
Some optimizing compilers miscompile the current SSE optimizations when
full optimizations are enabled. By using output value pointer instead of
a return value, we can bypass this misbehaviour.
---
libspeex/resample.c | 8 ++++----
libspeex/resample_sse.h | 24 ++++++++----------------
2 files changed, 12 insertions(+), 20 deletions(-)
diff --git a/libspeex/resample.c b/libspeex/resample.c
index 7b5a308..8131380 100644
--- a/libspeex/resample.c
+++ b/libspeex/resample.c
@@ -361,7 +361,7 @@ static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t c
sum = accum[0] + accum[1] + accum[2] + accum[3];
*/
#else
- sum = inner_product_single(sinc, iptr, N);
+ inner_product_single(&sum, sinc, iptr, N);
#endif
out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 15), 32767);
@@ -412,7 +412,7 @@ static int resampler_basic_direct_double(SpeexResamplerState *st, spx_uint32_t c
}
sum = accum[0] + accum[1] + accum[2] + accum[3];
#else
- sum = inner_product_double(sinc, iptr, N);
+ inner_product_double(&sum, sinc, iptr, N);
#endif
out[out_stride * out_sample++] = PSHR32(sum, 15);
@@ -472,7 +472,7 @@ static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint3
sum = MULT16_32_Q15(interp[0],SHR32(accum[0], 1)) + MULT16_32_Q15(interp[1],SHR32(accum[1], 1)) + MULT16_32_Q15(interp[2],SHR32(accum[2], 1)) + MULT16_32_Q15(interp[3],SHR32(accum[3], 1));
#else
cubic_coef(frac, interp);
- sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+ interpolate_product_single(&sum, iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
#endif
out[out_stride * out_sample++] = SATURATE32(PSHR32(sum, 14), 32767);
@@ -534,7 +534,7 @@ static int resampler_basic_interpolate_double(SpeexResamplerState *st, spx_uint3
sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]);
#else
cubic_coef(frac, interp);
- sum = interpolate_product_double(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
+ interpolate_product_double(&sum, iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp);
#endif
out[out_stride * out_sample++] = PSHR32(sum,15);
diff --git a/libspeex/resample_sse.h b/libspeex/resample_sse.h
index 64be8a1..86ff35e 100644
--- a/libspeex/resample_sse.h
+++ b/libspeex/resample_sse.h
@@ -37,10 +37,9 @@
#include <xmmintrin.h>
#define OVERRIDE_INNER_PRODUCT_SINGLE
-static inline float inner_product_single(const float *a, const float *b, unsigned int len)
+static inline void inner_product_single(float *ret, const float *a, const float *b, unsigned int len)
{
int i;
- float ret;
__m128 sum = _mm_setzero_ps();
for (i=0;i<len;i+=8)
{
@@ -49,14 +48,12 @@ static inline float inner_product_single(const float *a, const float *b, unsigne
}
sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
- _mm_store_ss(&ret, sum);
- return ret;
+ _mm_store_ss(ret, sum);
}
#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
-static inline float interpolate_product_single(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+static inline void interpolate_product_single(float *ret, const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
int i;
- float ret;
__m128 sum = _mm_setzero_ps();
__m128 f = _mm_loadu_ps(frac);
for(i=0;i<len;i+=2)
@@ -67,18 +64,16 @@ static inline float interpolate_product_single(const float *a, const float *b, u
sum = _mm_mul_ps(f, sum);
sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
- _mm_store_ss(&ret, sum);
- return ret;
+ _mm_store_ss(ret, sum);
}
#ifdef _USE_SSE2
#include <emmintrin.h>
#define OVERRIDE_INNER_PRODUCT_DOUBLE
-static inline double inner_product_double(const float *a, const float *b, unsigned int len)
+static inline void inner_product_double(double *ret, const float *a, const float *b, unsigned int len)
{
int i;
- double ret;
__m128d sum = _mm_setzero_pd();
__m128 t;
for (i=0;i<len;i+=8)
@@ -92,14 +87,12 @@ static inline double inner_product_double(const float *a, const float *b, unsign
sum = _mm_add_pd(sum, _mm_cvtps_pd(_mm_movehl_ps(t, t)));
}
sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
- _mm_store_sd(&ret, sum);
- return ret;
+ _mm_store_sd(ret, sum);
}
#define OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
-static inline double interpolate_product_double(const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
+static inline void interpolate_product_double(double *ret, const float *a, const float *b, unsigned int len, const spx_uint32_t oversample, float *frac) {
int i;
- double ret;
__m128d sum;
__m128d sum1 = _mm_setzero_pd();
__m128d sum2 = _mm_setzero_pd();
@@ -121,8 +114,7 @@ static inline double interpolate_product_double(const float *a, const float *b,
sum2 = _mm_mul_pd(f2, sum2);
sum = _mm_add_pd(sum1, sum2);
sum = _mm_add_sd(sum, _mm_unpackhi_pd(sum, sum));
- _mm_store_sd(&ret, sum);
- return ret;
+ _mm_store_sd(ret, sum);
}
#endif
--
1.6.4.msysgit.0.19.gd78f4
More information about the Speex-dev
mailing list