From: Johann Date: Tue, 22 Aug 2017 21:25:27 +0000 (-0700) Subject: quantize ssse3: copy style from sse2 X-Git-Tag: v1.7.0~222^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b9c1dcc5fa3674f6d4fdcfb5d7d0e324216d6bb3;p=libvpx quantize ssse3: copy style from sse2 Change-Id: I53f8a160e640c674ea035fc112e207b6dca42598 --- diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index b074a3e01..ddd0a66ec 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -23,104 +23,88 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { const __m128i zero = _mm_setzero_si128(); + intptr_t index = 16; + + __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - intptr_t index = 0; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i qtmp0, qtmp1; + __m128i zero_coeff0, zero_coeff1, iscan0, iscan1; + __m128i eob, eob0, eob1; + (void)scan_ptr; (void)skip_block; assert(!skip_block); - // Setup global values - { - const __m128i one = _mm_set1_epi16(1); - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // x86 has no "greater *or equal* comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } - - { - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - __m128i zero_coeff0, zero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + index); - coeff1 = load_tran_low(coeff_ptr + index + 8); - - qcoeff0 = _mm_abs_epi16(coeff0); - qcoeff1 = _mm_abs_epi16(coeff1); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - // Overwrite DC component. - zbin = _mm_unpackhi_epi64(zbin, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); - qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_tran_low(qcoeff0, qcoeff_ptr + index); - store_tran_low(qcoeff1, qcoeff_ptr + index + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + index); - store_tran_low(coeff1, dqcoeff_ptr + index + 8); - - // Scan for eob - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8)); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, cmp_mask0); - iscan1 = _mm_sub_epi16(iscan1, cmp_mask1); - eob = _mm_andnot_si128(zero_coeff0, iscan0); - eob1 = _mm_andnot_si128(zero_coeff1, iscan1); - eob = _mm_max_epi16(eob, eob1); - } - index += 16; - - // AC only loop + // Setup global values. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // x86 has no "greater *or equal* comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, _mm_set1_epi16(1)); + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); + qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); + + qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); + shift = _mm_unpackhi_epi64(shift, shift); + qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + // Scan for eob. + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8)); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, cmp_mask0); + iscan1 = _mm_sub_epi16(iscan1, cmp_mask1); + eob = _mm_andnot_si128(zero_coeff0, iscan0); + eob1 = _mm_andnot_si128(zero_coeff1, iscan1); + eob = _mm_max_epi16(eob, eob1); + + // AC only loop. while (index < n_coeffs) { - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - __m128i zero_coeff0, zero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - coeff0 = load_tran_low(coeff_ptr + index); coeff1 = load_tran_low(coeff_ptr + index + 8); @@ -142,11 +126,9 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - // Reinsert signs qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); - // Mask out zbin threshold coeffs qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); @@ -159,12 +141,10 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(coeff0, dqcoeff_ptr + index); store_tran_low(coeff1, dqcoeff_ptr + index + 8); - // Scan for eob zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + index)); iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + index + 8)); - // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, cmp_mask0); iscan1 = _mm_sub_epi16(iscan1, cmp_mask1); eob0 = _mm_andnot_si128(zero_coeff0, iscan0); @@ -175,7 +155,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, index += 16; } - // Accumulate EOB + // Accumulate eob. { __m128i eob_shuffled; eob_shuffled = _mm_shuffle_epi32(eob, 0xe);