From 7498fe2e542183ff6e8091608ae57fade2bde4ee Mon Sep 17 00:00:00 2001
From: Johann
Date: Fri, 12 May 2017 18:16:30 -0700
Subject: [PATCH] neon 4 byte helper functions

When data is guaranteed to be aligned, use helper functions which
assert that requirement.

Change-Id: Ic4b188593aea0799d5bd8eda64f9858a1592a2a3
---
 vpx_dsp/arm/idct4x4_add_neon.c | 19 ++++++++-----------
 vpx_dsp/arm/mem_neon.h         | 25 +++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/vpx_dsp/arm/idct4x4_add_neon.c b/vpx_dsp/arm/idct4x4_add_neon.c
index e44ba6e75..673a36840 100644
--- a/vpx_dsp/arm/idct4x4_add_neon.c
+++ b/vpx_dsp/arm/idct4x4_add_neon.c
@@ -20,7 +20,7 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
   const uint8_t *dst = dest;
   const int16x4_t cospis = vld1_s16(kCospi);
-  uint32x2_t dest01_u32 = vdup_n_u32(0);
+  uint8x8_t dest01_u8;
   uint32x2_t dest32_u32 = vdup_n_u32(0);
   int16x8_t a0, a1;
   uint8x8_t d01, d32;
@@ -40,25 +40,22 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
   a0 = vrshrq_n_s16(a0, 4);
   a1 = vrshrq_n_s16(a1, 4);
 
-  dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0);
-  dst += stride;
-  dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1);
-  dst += stride;
+  dest01_u8 = load_u8(dst, stride);
+  dst += 2 * stride;
+  // The elements are loaded in reverse order.
   dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1);
   dst += stride;
   dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0);
 
-  d01_u16 =
-      vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32));
+  d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8);
   d32_u16 =
       vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32));
   d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16));
   d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16));
 
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0);
-  dest += stride;
-  vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1);
-  dest += stride;
+  store_u8(dest, stride, d01);
+  dest += 2 * stride;
+  // The elements are stored in reverse order.
   vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1);
   dest += stride;
   vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0);
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index ef6e9decd..ba5c3d513 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -68,4 +68,29 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
   vst1q_s16(buf, a);
 #endif
 }
+
+// Load 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
+  uint32x2_t a = vdup_n_u32(0);
+
+  assert(!((intptr_t)buf % sizeof(uint32_t)));
+  assert(!(stride % sizeof(uint32_t)));
+
+  a = vld1_lane_u32((const uint32_t *)buf, a, 0);
+  buf += stride;
+  a = vld1_lane_u32((const uint32_t *)buf, a, 1);
+  return vreinterpret_u8_u32(a);
+}
+
+// Store 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
+  uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+
+  assert(!((intptr_t)buf % sizeof(uint32_t)));
+  assert(!(stride % sizeof(uint32_t)));
+
+  vst1_lane_u32((uint32_t *)buf, a_u32, 0);
+  buf += stride;
+  vst1_lane_u32((uint32_t *)buf, a_u32, 1);
+}
 #endif  // VPX_DSP_ARM_MEM_NEON_H_
-- 
2.40.0
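
Below is a minimal usage sketch, not part of the patch, showing how the new load_u8()/store_u8() helpers from vpx_dsp/arm/mem_neon.h might be called. The function name average_two_rows, the pixels/stride parameters, and the vrhadd_u8() averaging step are illustrative assumptions; the only requirement the helpers actually impose is that the pointer and the stride are multiples of sizeof(uint32_t), which they assert.

#include <stdint.h>
#include <arm_neon.h>

#include "vpx_dsp/arm/mem_neon.h"  // provides load_u8() and store_u8()

// Hypothetical helper: replace two adjacent 4-byte rows with their
// rounding average. Both `pixels` and `stride` must be multiples of
// sizeof(uint32_t), otherwise the asserts inside load_u8()/store_u8() fire.
static void average_two_rows(uint8_t *pixels, int stride) {
  // Row 0 lands in lanes 0-3, row 1 in lanes 4-7 of one 8-byte vector.
  uint8x8_t rows = load_u8(pixels, stride);
  // Swap the two 32-bit halves so each row sits opposite the other.
  const uint8x8_t swapped =
      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(rows)));
  // Rounding halving add gives the per-byte average of the two rows.
  rows = vrhadd_u8(rows, swapped);
  // Write both averaged rows back with the aligned store helper.
  store_u8(pixels, stride, rows);
}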