From: Johann Date: Wed, 27 Jul 2016 21:24:14 +0000 (-0700) Subject: Don't expand to Q register for 4x4 intrapred X-Git-Tag: v1.6.1~346^2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=df69c751a7552fa162fbcf64da14830c753342f3;p=libvpx Don't expand to Q register for 4x4 intrapred The code was expanding to Q registers so that vrshrn_n_u16 (vector rounding shift right and narrow, which takes a Q-register input) could be used. If 4 values are added together, there is a shift by 2. If 8 values, a shift by 3. Since the rounded, shifted sum therefore always fits in 8 bits (at most 255), there is no possibility of overflow and we can skip the narrowing shift, using the non-narrowing vrshr_n_u16 instead. This allows keeping the values in D registers and reinterpreting the 16-bit value as 8-bit lanes (vreinterpret_u8_u16). Change-Id: I8d9cfa07176271f492c116ffa6a7b351af0b8751 --- diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 32dd1ba14..38e79ed69 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -20,37 +20,35 @@ // 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; + uint16x4_t sum_top; + uint16x4_t sum_left; + uint16x4_t dc0; if (do_above) { const uint8x8_t A = vld1_u8(above); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_top = vcombine_u16(p1, p1); + sum_top = vpadd_u16(p0, p0); } if (do_left) { const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_left = vcombine_u16(p1, p1); + sum_left = vpadd_u16(p0, p0); } if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 3); + const uint16x4_t sum = vadd_u16(sum_left, sum_top); + dc0 = vrshr_n_u16(sum, 3); } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 2); + dc0 = 
vrshr_n_u16(sum_top, 2); } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 2); + dc0 = vrshr_n_u16(sum_left, 2); } else { - dc0 = vdup_n_u8(0x80); + dc0 = vdup_n_u16(0x80); } { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); + const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0); int i; for (i = 0; i < 4; ++i) { vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0);