From: Even Rouault
Date: Fri, 1 Sep 2017 20:09:58 +0000 (+0200)
Subject: opj_v4dwt_decode_step1_sse(): rework a bit to improve code generation
X-Git-Tag: v2.3.0~39^2~1
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=c1e0fba0c46cb528a08b89b986e86ff0f4792558;p=openjpeg

opj_v4dwt_decode_step1_sse(): rework a bit to improve code generation
---

diff --git a/src/lib/openjp2/dwt.c b/src/lib/openjp2/dwt.c
index 71597f81..719c7330 100644
--- a/src/lib/openjp2/dwt.c
+++ b/src/lib/openjp2/dwt.c
@@ -2274,14 +2274,19 @@ static void opj_v4dwt_decode_step1_sse(opj_v4_t* w,
     __m128* OPJ_RESTRICT vw = (__m128*) w;
     OPJ_UINT32 i;
     /* 4x unrolled loop */
-    for (i = start; i + 3 < end; i += 4) {
-        vw[2 * i] = _mm_mul_ps(vw[2 * i], c);
-        vw[2 * i + 2] = _mm_mul_ps(vw[2 * i + 2], c);
-        vw[2 * i + 4] = _mm_mul_ps(vw[2 * i + 4], c);
-        vw[2 * i + 6] = _mm_mul_ps(vw[2 * i + 6], c);
-    }
-    for (; i < end; ++i) {
-        vw[2 * i] = _mm_mul_ps(vw[2 * i], c);
+    vw += 2 * start;
+    for (i = start; i + 3 < end; i += 4, vw += 8) {
+        __m128 xmm0 = _mm_mul_ps(vw[0], c);
+        __m128 xmm2 = _mm_mul_ps(vw[2], c);
+        __m128 xmm4 = _mm_mul_ps(vw[4], c);
+        __m128 xmm6 = _mm_mul_ps(vw[6], c);
+        vw[0] = xmm0;
+        vw[2] = xmm2;
+        vw[4] = xmm4;
+        vw[6] = xmm6;
+    }
+    for (; i < end; ++i, vw += 2) {
+        vw[0] = _mm_mul_ps(vw[0], c);
     }
 }
 