+ if (bd == 8) {
+ blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
+ limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
+ thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
+ t80 = _mm_set1_epi16(0x80);
+ tff80 = _mm_set1_epi16(0xff80);
+ tffe0 = _mm_set1_epi16(0xffe0);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
+ } else if (bd == 10) {
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
+ } else { // bd == 12
+ blimit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
+ limit = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
+ thresh = _mm_slli_epi16(
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
+ t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
+ tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
+ tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
+ t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
+ t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
+ }
+
+ ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
+ ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
+ qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
+ qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
+