From 5b87f1bb5a6996e0595b3cb78971a42ca91a97ca Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Tue, 24 Feb 2015 14:48:12 -0800 Subject: [PATCH] Fix high bit-depth loop-filter sse2 compiling issue - part 4 Change-Id: I39f56f60425836f2e1ec07da71edd4810a4c78bb --- .../x86/vp9_high_loopfilter_intrin_sse2.c | 70 +++++++++++++------ 1 file changed, 48 insertions(+), 22 deletions(-) diff --git a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c index 7e63f389e..8228fe6c3 100644 --- a/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_high_loopfilter_intrin_sse2.c @@ -706,15 +706,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, const uint8_t *_thresh, int count, int bd) { const __m128i zero = _mm_set1_epi16(0); - const __m128i blimit = _mm_slli_epi16( - _mm_unpacklo_epi8( - _mm_load_si128((const __m128i *)_blimit), zero), bd - 8); - const __m128i limit = _mm_slli_epi16( - _mm_unpacklo_epi8( - _mm_load_si128((const __m128i *)_limit), zero), bd - 8); - const __m128i thresh = _mm_slli_epi16( - _mm_unpacklo_epi8( - _mm_load_si128((const __m128i *)_thresh), zero), bd - 8); + __m128i blimit, limit, thresh; __m128i mask, hev, flat; __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); @@ -737,30 +729,63 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, __m128i work; const __m128i t4 = _mm_set1_epi16(4); const __m128i t3 = _mm_set1_epi16(3); - const __m128i t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), bd - 8); - const __m128i tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), bd - 8); - const __m128i tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), bd - 8); - const __m128i t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 16 - bd); + __m128i t80; + __m128i tff80; + __m128i tffe0; + __m128i t1f; // equivalent to shifting 0x1f left by bitdepth - 8 // and setting new bits to 1 const __m128i t1 = _mm_set1_epi16(0x1); - const __m128i t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 16 - bd); + __m128i t7f; // equivalent to shifting 0x7f left by bitdepth - 8 // and setting new bits to 1 - const __m128i ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), - t80); - const __m128i ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), - t80); - const __m128i qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), - t80); - const __m128i qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), - t80); + __m128i ps1, ps0, qs0, qs1; __m128i filt; __m128i work_a; __m128i filter1, filter2; (void)count; + if (bd == 8) { + blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero); + limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero); + thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero); + t80 = _mm_set1_epi16(0x80); + tff80 = _mm_set1_epi16(0xff80); + tffe0 = _mm_set1_epi16(0xffe0); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8); + } else if (bd == 10) { + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2); + tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2); + tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6); + } else { // bd == 12 + blimit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4); + limit = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4); + thresh = _mm_slli_epi16( + _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4); + t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4); + tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4); + tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4); + t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4); + t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4); + } + + ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); + ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); + qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); + qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); + // filter_mask and hev_mask flat = _mm_max_epi16(abs_p1p0, abs_q1q0); hev = _mm_subs_epu16(flat, thresh); @@ -796,6 +821,7 @@ void vp9_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p, filt = _mm_adds_epi16(filt, work_a); filt = _mm_adds_epi16(filt, work_a); filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd); + // (vp9_filter + 3 * (qs0 - ps0)) & mask filt = _mm_and_si128(filt, mask); -- 2.40.0