From 760c214519fe69951297985ee8d7cd817a6febc4 Mon Sep 17 00:00:00 2001
From: Kyle Siefring
Date: Mon, 1 May 2017 09:19:11 -0700
Subject: [PATCH] block error avx2: sum in 32 bits when possible

Add 31-bit pairs before unpacking in the x86 block error AVX2 code.
This provides a very minor performance improvement.

BUG=webm:1210

Change-Id: I4c82308eaf65741dca2f5c6db9be9c85f905073a
---
 vp9/encoder/x86/vp9_error_avx2.c | 88 ++++++++++++++++++++++----------
 1 file changed, 61 insertions(+), 27 deletions(-)

diff --git a/vp9/encoder/x86/vp9_error_avx2.c b/vp9/encoder/x86/vp9_error_avx2.c
index e39027f25..041744163 100644
--- a/vp9/encoder/x86/vp9_error_avx2.c
+++ b/vp9/encoder/x86/vp9_error_avx2.c
@@ -8,7 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <immintrin.h>  // AVX2
+#include <assert.h>
+#include <immintrin.h>
 
 #include "./vp9_rtcd.h"
 #include "vpx/vpx_integer.h"
@@ -17,55 +18,88 @@
 int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                              intptr_t block_size, int64_t *ssz) {
-  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+  __m256i sse_reg, ssz_reg;
   __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
   __m256i sse_reg_64hi, ssz_reg_64hi;
   __m128i sse_reg128, ssz_reg128;
   int64_t sse;
-  int i;
-  const __m256i zero_reg = _mm256_set1_epi16(0);
+  const __m256i zero_reg = _mm256_setzero_si256();
 
-  // init sse and ssz registerd to zero
-  sse_reg = _mm256_set1_epi16(0);
-  ssz_reg = _mm256_set1_epi16(0);
-
-  for (i = 0; i < block_size; i += 16) {
-    // load 32 bytes from coeff and dqcoeff
-    coeff_reg = load_tran_low(coeff + i);
-    dqcoeff_reg = load_tran_low(dqcoeff + i);
+  // If the block size is 16 then the results will fit in 32 bits.
+  if (block_size == 16) {
+    __m256i coeff_reg, dqcoeff_reg, coeff_reg_hi, dqcoeff_reg_hi;
+    // Load 16 elements for coeff and dqcoeff.
+    coeff_reg = load_tran_low(coeff);
+    dqcoeff_reg = load_tran_low(dqcoeff);
     // dqcoeff - coeff
     dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
     // madd (dqcoeff - coeff)
     dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
     // madd coeff
     coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
-    // expand each double word of madd (dqcoeff - coeff) to quad word
-    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
-    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
-    // expand each double word of madd (coeff) to quad word
-    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
-    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
-    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
-    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
-    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
-    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
-    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+    // Save the higher 64 bit of each 128 bit lane.
+    dqcoeff_reg_hi = _mm256_srli_si256(dqcoeff_reg, 8);
+    coeff_reg_hi = _mm256_srli_si256(coeff_reg, 8);
+    // Add the higher 64 bit to the low 64 bit.
+    dqcoeff_reg = _mm256_add_epi32(dqcoeff_reg, dqcoeff_reg_hi);
+    coeff_reg = _mm256_add_epi32(coeff_reg, coeff_reg_hi);
+    // Expand each double word in the lower 64 bits to quad word.
+    sse_reg = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+    ssz_reg = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+  } else {
+    int i;
+    assert(block_size % 32 == 0);
+    sse_reg = zero_reg;
+    ssz_reg = zero_reg;
+
+    for (i = 0; i < block_size; i += 32) {
+      __m256i coeff_reg_0, coeff_reg_1, dqcoeff_reg_0, dqcoeff_reg_1;
+      // Load 32 elements for coeff and dqcoeff.
+      coeff_reg_0 = load_tran_low(coeff + i);
+      dqcoeff_reg_0 = load_tran_low(dqcoeff + i);
+      coeff_reg_1 = load_tran_low(coeff + i + 16);
+      dqcoeff_reg_1 = load_tran_low(dqcoeff + i + 16);
+      // dqcoeff - coeff
+      dqcoeff_reg_0 = _mm256_sub_epi16(dqcoeff_reg_0, coeff_reg_0);
+      dqcoeff_reg_1 = _mm256_sub_epi16(dqcoeff_reg_1, coeff_reg_1);
+      // madd (dqcoeff - coeff)
+      dqcoeff_reg_0 = _mm256_madd_epi16(dqcoeff_reg_0, dqcoeff_reg_0);
+      dqcoeff_reg_1 = _mm256_madd_epi16(dqcoeff_reg_1, dqcoeff_reg_1);
+      // madd coeff
+      coeff_reg_0 = _mm256_madd_epi16(coeff_reg_0, coeff_reg_0);
+      coeff_reg_1 = _mm256_madd_epi16(coeff_reg_1, coeff_reg_1);
+      // Add the first madd (dqcoeff - coeff) with the second.
+      dqcoeff_reg_0 = _mm256_add_epi32(dqcoeff_reg_0, dqcoeff_reg_1);
+      // Add the first madd (coeff) with the second.
+      coeff_reg_0 = _mm256_add_epi32(coeff_reg_0, coeff_reg_1);
+      // Expand each double word of madd (dqcoeff - coeff) to quad word.
+      exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg_0, zero_reg);
+      exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg_0, zero_reg);
+      // expand each double word of madd (coeff) to quad word
+      exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg_0, zero_reg);
+      exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg_0, zero_reg);
+      // Add each quad word of madd (dqcoeff - coeff) and madd (coeff).
+      sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+      ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+      sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+      ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+    }
   }
-  // save the higher 64 bit of each 128 bit lane
+  // Save the higher 64 bit of each 128 bit lane.
   sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
   ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
-  // add the higher 64 bit to the low 64 bit
+  // Add the higher 64 bit to the low 64 bit.
   sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
   ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
-  // add each 64 bit from each of the 128 bit lane of the 256 bit
+  // Add each 64 bit from each of the 128 bit lane of the 256 bit.
   sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
                              _mm256_extractf128_si256(sse_reg, 1));
   ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
                              _mm256_extractf128_si256(ssz_reg, 1));
-  // store the results
+  // Store the results.
   _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
   _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
-- 
2.40.0
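
Not part of the patch: the standalone sketch below illustrates the trick the commit message describes, i.e. each _mm256_madd_epi16 dword is at most 31 bits (for inputs away from INT16_MIN), so two of them can be paired in a 32-bit lane before being zero-extended to 64 bits. It applies the idea to a plain sum of squares of 16 int16_t values; the name sum_squares16_avx2 and the harness are illustrative only, not libvpx code, and it assumes an AVX2 build (e.g. gcc -O2 -mavx2 sketch.c) and inputs that never equal -32768.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Sum of squares of 16 int16_t values, mirroring the block_size == 16 path:
// pair the 32-bit madd results before widening to 64 bits.
static int64_t sum_squares16_avx2(const int16_t *x) {
  const __m256i zero = _mm256_setzero_si256();
  const __m256i v = _mm256_loadu_si256((const __m256i *)x);
  // Each dword is x[2k]^2 + x[2k+1]^2 <= 2 * 32767^2, i.e. at most 31 bits.
  const __m256i sq = _mm256_madd_epi16(v, v);
  // Add the upper 64 bits of each 128-bit lane onto the lower 64 bits.
  // The sum of two 31-bit dwords still fits a 32-bit lane read as unsigned.
  const __m256i pair = _mm256_add_epi32(sq, _mm256_srli_si256(sq, 8));
  // Zero-extend the four surviving dwords to quad words and reduce.
  const __m256i wide = _mm256_unpacklo_epi32(pair, zero);
  __m128i s = _mm_add_epi64(_mm256_castsi256_si128(wide),
                            _mm256_extractf128_si256(wide, 1));
  int64_t total;
  s = _mm_add_epi64(s, _mm_srli_si128(s, 8));
  _mm_storel_epi64((__m128i *)&total, s);
  return total;
}

int main(void) {
  int16_t x[16];
  int64_t ref = 0;
  int i;
  for (i = 0; i < 16; ++i) {
    x[i] = (int16_t)((i & 1) ? 32767 : -32767);
    ref += (int64_t)x[i] * x[i];
  }
  printf("avx2: %lld, scalar: %lld\n", (long long)sum_squares16_avx2(x),
         (long long)ref);
  return 0;
}

The loop path of the patch uses the same pairing, adding the two madd results of each 32-element iteration in 32-bit lanes before the 64-bit accumulation.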