From 4fa0727fbcf9bb790139fa34b10d610af824a7ef Mon Sep 17 00:00:00 2001 From: =?utf8?q?Matthias=20R=C3=A4ncker?= Date: Fri, 21 Sep 2018 15:33:18 +0200 Subject: [PATCH] sanitizer: sse2 - fix unaligned double stores MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Signed-off-by: Matthias Räncker Change-Id: I838c8678e62f7cff13387b84d4f3ea42710a67ea --- vpx_dsp/x86/loopfilter_sse2.c | 24 ++++++++---------------- vpx_dsp/x86/mem_sse2.h | 3 +++ 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/vpx_dsp/x86/loopfilter_sse2.c b/vpx_dsp/x86/loopfilter_sse2.c index 1a76d670e..853c4d270 100644 --- a/vpx_dsp/x86/loopfilter_sse2.c +++ b/vpx_dsp/x86/loopfilter_sse2.c @@ -1627,16 +1627,12 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpacklo_epi16(x2, x3); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 0 * out_p), - _mm_castsi128_pd(x6)); // 00 10 20 30 40 50 60 70 - _mm_storeh_pd((double *)(out + 1 * out_p), - _mm_castsi128_pd(x6)); // 01 11 21 31 41 51 61 71 + mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70 + mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 2 * out_p), - _mm_castsi128_pd(x7)); // 02 12 22 32 42 52 62 72 - _mm_storeh_pd((double *)(out + 3 * out_p), - _mm_castsi128_pd(x7)); // 03 13 23 33 43 53 63 73 + mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72 + mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 x4 = _mm_unpackhi_epi16(x0, x1); @@ -1644,17 +1640,13 @@ static INLINE void transpose(unsigned char *src[], int in_p, x5 = _mm_unpackhi_epi16(x2, x3); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 x6 = _mm_unpacklo_epi32(x4, x5); - _mm_storel_pd((double *)(out + 4 * out_p), - _mm_castsi128_pd(x6)); // 04 14 24 34 44 54 64 74 - _mm_storeh_pd((double *)(out + 5 * out_p), - _mm_castsi128_pd(x6)); // 05 15 25 35 45 55 65 75 + mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74 + mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 x7 = _mm_unpackhi_epi32(x4, x5); - _mm_storel_pd((double *)(out + 6 * out_p), - _mm_castsi128_pd(x7)); // 06 16 26 36 46 56 66 76 - _mm_storeh_pd((double *)(out + 7 * out_p), - _mm_castsi128_pd(x7)); // 07 17 27 37 47 57 67 77 + mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76 + mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77 } while (++idx8x8 < num_8x8_to_transpose); } diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 48dc97970..5209a0628 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -26,6 +26,9 @@ static INLINE uint32_t loadu_uint32(const void *src) { return v; } +#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) +#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) + static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) { return _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); -- 2.40.0