From: Linfeng Zhang Date: Wed, 5 Apr 2017 21:41:35 +0000 (-0700) Subject: Update 32x32 high bitdepth idct NEON optimization X-Git-Tag: v1.7.0~579 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=6fc2e57c2ca6c9a7b1eecb2c7d93b65222b6727d;p=libvpx Update 32x32 high bitdepth idct NEON optimization Preparation of CONVERT_TO_BYTEPTR/SHORTPTR clean up. BUG=webm:1388 Change-Id: I928d30a5698023bb90888d783cf81c51ec183760 --- diff --git a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c index ca3c3bee4..52f3d43e5 100644 --- a/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c @@ -726,9 +726,10 @@ static void vpx_highbd_idct32_16_neon(const int32_t *const input, highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); } -void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); if (bd == 8) { int16_t temp[32 * 16]; @@ -742,16 +743,15 @@ void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, dest += 8; } } else { - uint16_t *dst = CONVERT_TO_SHORTPTR(dest); int32_t temp[32 * 16]; int32_t *t = temp; vpx_highbd_idct32_12_neon(input, temp); vpx_highbd_idct32_12_neon(input + 32 * 8, temp + 8); for (i = 0; i < 32; i += 8) { - vpx_highbd_idct32_16_neon(t, dst, stride, bd); + vpx_highbd_idct32_16_neon(t, dest, stride, bd); t += (16 * 8); - dst += 8; + dest += 8; } } } diff --git a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c index 6b98ad548..195dcc92d 100644 --- a/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c +++ b/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c @@ -594,9 +594,10 @@ static void vpx_highbd_idct32_8_neon(const int32_t *input, uint16_t *output, highbd_idct16x16_add_store(out + 16, output + 16 * stride, stride, bd); } -void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); if (bd == 8) { int16_t temp[32 * 8]; @@ -610,16 +611,15 @@ void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, dest += 8; } } else { - uint16_t *dst = CONVERT_TO_SHORTPTR(dest); int32_t temp[32 * 8]; int32_t *t = temp; vpx_highbd_idct32_6_neon(input, t); for (i = 0; i < 32; i += 8) { - vpx_highbd_idct32_8_neon(t, dst, stride, bd); + vpx_highbd_idct32_8_neon(t, dest, stride, bd); t += (8 * 8); - dst += 8; + dest += 8; } } } diff --git a/vpx_dsp/arm/idct32x32_135_add_neon.c b/vpx_dsp/arm/idct32x32_135_add_neon.c index f8be96874..b39825991 100644 --- a/vpx_dsp/arm/idct32x32_135_add_neon.c +++ b/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -371,7 +371,7 @@ void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output) { vst1q_s16(output, vsubq_s16(s7[0], s6[31])); } -void vpx_idct32_16_neon(const int16_t *const input, uint8_t *const output, +void vpx_idct32_16_neon(const int16_t *const input, void *const output, const int stride, const int highbd_flag) { int16x8_t in[16], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], out[32]; @@ -646,17 +646,17 @@ void vpx_idct32_16_neon(const int16_t *const input, uint8_t *const output, out[31] = final_sub(s7[0], s6[31]); if (highbd_flag) { - uint16_t *const outputT = CONVERT_TO_SHORTPTR(output); - highbd_add_and_store_bd8(out, outputT, stride); + highbd_add_and_store_bd8(out, output, stride); } else { + uint8_t *const outputT = (uint8_t *)output; add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], output, stride); + out[7], outputT, stride); add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], output + (8 * stride), stride); + out[14], out[15], outputT + (8 * stride), stride); add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], output + (16 * stride), stride); + out[22], out[23], outputT + (16 * stride), stride); add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], output + (24 * stride), stride); + out[30], out[31], outputT + (24 * stride), stride); } } diff --git a/vpx_dsp/arm/idct32x32_34_add_neon.c b/vpx_dsp/arm/idct32x32_34_add_neon.c index 99dd7164b..fc0c4cd84 100644 --- a/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -265,7 +265,7 @@ void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) { vst1q_s16(output, vsubq_s16(s1[0], s2[31])); } -void vpx_idct32_8_neon(const int16_t *input, uint8_t *output, int stride, +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, const int highbd_flag) { int16x8_t in[8], s1[32], s2[32], s3[32], out[32]; @@ -486,17 +486,17 @@ void vpx_idct32_8_neon(const int16_t *input, uint8_t *output, int stride, out[31] = final_sub(s1[0], s2[31]); if (highbd_flag) { - uint16_t *const outputT = CONVERT_TO_SHORTPTR(output); - highbd_add_and_store_bd8(out, outputT, stride); + highbd_add_and_store_bd8(out, output, stride); } else { + uint8_t *const outputT = (uint8_t *)output; add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], - out[7], output, stride); + out[7], outputT, stride); add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], - out[14], out[15], output + (8 * stride), stride); + out[14], out[15], outputT + (8 * stride), stride); add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], - out[22], out[23], output + (16 * stride), stride); + out[22], out[23], outputT + (16 * stride), stride); add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], - out[30], out[31], output + (24 * stride), stride); + out[30], out[31], outputT + (24 * stride), stride); } } diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index fc4558c01..27c784edc 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -982,11 +982,11 @@ void vpx_idct32_32_neon(const tran_low_t *input, uint8_t *dest, const int stride, const int highbd_flag); void vpx_idct32_12_neon(const tran_low_t *const input, int16_t *output); -void vpx_idct32_16_neon(const int16_t *const input, uint8_t *const output, +void vpx_idct32_16_neon(const int16_t *const input, void *const output, const int stride, const int highbd_flag); void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output); -void vpx_idct32_8_neon(const int16_t *input, uint8_t *output, int stride, +void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, const int highbd_flag); #endif // VPX_DSP_ARM_IDCT_NEON_H_