From: Scott LaVarnway
Date: Fri, 20 Jul 2018 10:50:49 +0000 (-0700)
Subject: VPX: Improve HBD vpx_hadamard_16x16_sse2()
X-Git-Tag: v1.8.0~514^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=17ffb22d4daf4c4645b4c79086d8c27a2768775e;p=libvpx

VPX: Improve HBD vpx_hadamard_16x16_sse2()

~12% speed improvement.

Change-Id: Ieca4d870a4c1c5ea2c689e27fc4550fcbab9f867
---

diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index 0362c63c5..bb744a981 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -264,8 +264,9 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
   }
 }
 
-void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
-                           tran_low_t *coeff) {
+static INLINE void hadamard_8x8_sse2(int16_t const *src_diff,
+                                     ptrdiff_t src_stride, tran_low_t *_coeff,
+                                     int is_final) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
   src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -279,37 +280,72 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
   hadamard_col8_sse2(src, 0);
   hadamard_col8_sse2(src, 1);
 
-  store_tran_low(src[0], coeff);
-  coeff += 8;
-  store_tran_low(src[1], coeff);
-  coeff += 8;
-  store_tran_low(src[2], coeff);
-  coeff += 8;
-  store_tran_low(src[3], coeff);
-  coeff += 8;
-  store_tran_low(src[4], coeff);
-  coeff += 8;
-  store_tran_low(src[5], coeff);
-  coeff += 8;
-  store_tran_low(src[6], coeff);
-  coeff += 8;
-  store_tran_low(src[7], coeff);
+  if (is_final) {
+    store_tran_low(src[0], _coeff);
+    _coeff += 8;
+    store_tran_low(src[1], _coeff);
+    _coeff += 8;
+    store_tran_low(src[2], _coeff);
+    _coeff += 8;
+    store_tran_low(src[3], _coeff);
+    _coeff += 8;
+    store_tran_low(src[4], _coeff);
+    _coeff += 8;
+    store_tran_low(src[5], _coeff);
+    _coeff += 8;
+    store_tran_low(src[6], _coeff);
+    _coeff += 8;
+    store_tran_low(src[7], _coeff);
+  } else {
+    int16_t *coeff = (int16_t *)_coeff;
+    _mm_store_si128((__m128i *)coeff, src[0]);
+    coeff += 8;
+    _mm_store_si128((__m128i *)coeff, src[1]);
+    coeff += 8;
+    _mm_store_si128((__m128i *)coeff, src[2]);
+    coeff += 8;
+    _mm_store_si128((__m128i *)coeff, src[3]);
+    coeff += 8;
+    _mm_store_si128((__m128i *)coeff, src[4]);
+    coeff += 8;
+    _mm_store_si128((__m128i *)coeff, src[5]);
+    coeff += 8;
+    _mm_store_si128((__m128i *)coeff, src[6]);
+    coeff += 8;
+    _mm_store_si128((__m128i *)coeff, src[7]);
+  }
+}
+
+void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
+                           tran_low_t *coeff) {
+  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
 }
 
 void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
                              tran_low_t *coeff) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  // For high bitdepths, it is unnecessary to store_tran_low
+  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+  // next stage.  Output to an intermediate buffer first, then store_tran_low()
+  // in the final stage.
+  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+  int16_t *t_coeff = temp_coeff;
+#else
+  int16_t *t_coeff = coeff;
+#endif
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     int16_t const *src_ptr =
         src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
-    vpx_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
+                      0);
   }
 
   for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = load_tran_low(coeff);
-    __m128i coeff1 = load_tran_low(coeff + 64);
-    __m128i coeff2 = load_tran_low(coeff + 128);
-    __m128i coeff3 = load_tran_low(coeff + 192);
+    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
 
     __m128i b0 = _mm_add_epi16(coeff0, coeff1);
     __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
@@ -332,6 +368,7 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride,
     store_tran_low(coeff3, coeff + 192);
 
     coeff += 8;
+    t_coeff += 8;
   }
 }
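
For context, a minimal sketch of the round trip the intermediate buffer avoids. It is not part of the patch: when CONFIG_VP9_HIGHBITDEPTH is enabled, tran_low_t is int32_t, so a store_tran_low()-style store must widen eight 16-bit coefficients into two 32-bit vectors (two stores), and the following load_tran_low()-style load must read both vectors and pack them back down to 16 bits. The helper names widen_store_8() and load_pack_8() below are hypothetical, and the sign extension uses a shift rather than the library's mult/unpack sequence; the sketch only approximates the work the patch skips between the 8x8 passes by keeping first-pass results in the int16_t temp_coeff buffer and widening once in the final stage.

/* Illustrative only: hypothetical helpers approximating the HBD round trip
 * between hadamard passes; not libvpx code. */
#include <emmintrin.h>
#include <stdint.h>

/* Roughly what a widening store must do when tran_low_t is int32_t:
 * sign-extend eight int16 lanes to int32 and issue two stores. */
static void widen_store_8(__m128i v, int32_t *out) {
  const __m128i sign = _mm_srai_epi16(v, 15);      /* 0 or -1 per lane */
  const __m128i lo = _mm_unpacklo_epi16(v, sign);  /* four low int32 */
  const __m128i hi = _mm_unpackhi_epi16(v, sign);  /* four high int32 */
  _mm_storeu_si128((__m128i *)out, lo);
  _mm_storeu_si128((__m128i *)(out + 4), hi);
}

/* Roughly what the next stage must then undo: two loads plus a saturating
 * pack back to int16. */
static __m128i load_pack_8(const int32_t *in) {
  const __m128i lo = _mm_loadu_si128((const __m128i *)in);
  const __m128i hi = _mm_loadu_si128((const __m128i *)(in + 4));
  return _mm_packs_epi32(lo, hi);
}

int main(void) {
  int16_t samples[8] = { 1, -2, 3, -4, 5, -6, 7, -8 };
  int32_t wide[8];
  __m128i v = _mm_loadu_si128((const __m128i *)samples);
  widen_store_8(v, wide); /* widen + store: no longer done between passes */
  v = load_pack_8(wide);  /* load + pack: no longer done between passes */
  (void)v;
  return 0;
}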