From 26d3d3af6ace50ee20c474771a580e38f86c7864 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Mon, 30 Mar 2015 12:31:46 -0700 Subject: [PATCH] Enable 16x16 Hadamard transform in SATD based mode decision This commit replaces the 16x16 2D-DCT transform with Hadamard transform for RTC coding mode. It reduces the CPU cycles cost on 16x16 transform by 5X. Overall it makes the speed -6 encoding speed 1.5% faster without compromise on compression performance. Change-Id: If6c993831dc4c678d841edc804ff395ed37f2a1b --- vp9/common/vp9_rtcd_defs.pl | 4 +-- vp9/encoder/vp9_avg.c | 9 ++++++- vp9/encoder/vp9_pickmode.c | 2 +- vp9/encoder/x86/vp9_avg_intrin_sse2.c | 38 +++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 4 deletions(-) diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index ed43010e3..887f407ba 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -1112,8 +1112,8 @@ specialize qw/vp9_avg_4x4 sse2/; add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff"; specialize qw/vp9_hadamard_8x8 sse2/; -add_proto qw/void vp9_hadamard_16x16/, "int16_t *coeff"; -specialize qw/vp9_hadamard_16x16/; +add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff"; +specialize qw/vp9_hadamard_16x16 sse2/; add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length"; specialize qw/vp9_satd sse2/; diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c index b5632a20d..58daa3ad4 100644 --- a/vp9/encoder/vp9_avg.c +++ b/vp9/encoder/vp9_avg.c @@ -78,8 +78,15 @@ void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, } // In place 16x16 2D Hadamard transform -void vp9_hadamard_16x16_c(int16_t *coeff) { +void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, + int16_t *coeff) { int idx; + for (idx = 0; idx < 4; ++idx) { + int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + + (idx & 0x01) * 8; + vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); + } + for (idx = 0; idx < 64; ++idx) { int16_t a0 = coeff[0]; int16_t a1 = coeff[64]; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index fa1f94dad..1221b3a92 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -375,7 +375,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist, scan_order->scan, scan_order->iscan); break; case TX_16X16: - vp9_fdct16x16(src_diff, coeff, diff_stride); + vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff); vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c index 3f78b8a0f..ee3ead49c 100644 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c @@ -165,6 +165,44 @@ void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, _mm_storeu_si128((__m128i *)coeff, src[7]); } +void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, + int16_t *coeff) { + int idx; + for (idx = 0; idx < 4; ++idx) { + int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride + + (idx & 0x01) * 8; + vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); + } + + for (idx = 0; idx < 64; idx += 8) { + __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); + __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); + __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); + __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); + + __m128i b0 = _mm_add_epi16(coeff0, coeff1); + __m128i b1 = _mm_sub_epi16(coeff0, coeff1); + __m128i b2 = _mm_add_epi16(coeff2, coeff3); + __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + + coeff0 = _mm_add_epi16(b0, b2); + coeff1 = _mm_add_epi16(b1, b3); + coeff0 = _mm_srai_epi16(coeff0, 1); + coeff1 = _mm_srai_epi16(coeff1, 1); + _mm_store_si128((__m128i *)coeff, coeff0); + _mm_store_si128((__m128i *)(coeff + 64), coeff1); + + coeff2 = _mm_sub_epi16(b0, b2); + coeff3 = _mm_sub_epi16(b1, b3); + coeff2 = _mm_srai_epi16(coeff2, 1); + coeff3 = _mm_srai_epi16(coeff3, 1); + _mm_store_si128((__m128i *)(coeff + 128), coeff2); + _mm_store_si128((__m128i *)(coeff + 192), coeff3); + + coeff += 8; + } +} + int16_t vp9_satd_sse2(const int16_t *coeff, int length) { int i; __m128i sum = _mm_load_si128((const __m128i *)coeff); -- 2.40.0