From ff209de82b90ac2361da0bd40497a08add25920b Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Wed, 30 Apr 2014 06:58:16 -0700 Subject: [PATCH] Improved intrinsic version of vp8_denoiser_filter_neon Used horizonal add instructions instead of adding byte lanes. The encoder performance improved by ~4% for the test clip used. Change-Id: Iaddd10403fcffb5b3f53b1f591ab2fe0ff002c08 --- vp8/encoder/arm/neon/denoising_neon.c | 59 +++++++++++++-------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c index 3f8539759..23dc0a967 100644 --- a/vp8/encoder/arm/neon/denoising_neon.c +++ b/vp8/encoder/arm/neon/denoising_neon.c @@ -68,14 +68,11 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, int mc_running_avg_y_stride = mc_running_avg->y_stride; unsigned char *running_avg_y = running_avg->y_buffer + y_offset; int running_avg_y_stride = running_avg->y_stride; + int64x2_t v_sum_diff_total = vdupq_n_s64(0); /* Go over lines. */ int i; - int sum_diff = 0; for (i = 0; i < 16; ++i) { - int8x16_t v_sum_diff = vdupq_n_s8(0); - uint8x16_t v_running_avg_y; - /* Load inputs. */ const uint8x16_t v_sig = vld1q_u8(sig); const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); @@ -117,12 +114,9 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, v_abs_adjustment); const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, v_abs_adjustment); - v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); - v_sum_diff = vqaddq_s8(v_sum_diff, - vreinterpretq_s8_u8(v_pos_adjustment)); - v_sum_diff = vqsubq_s8(v_sum_diff, - vreinterpretq_s8_u8(v_neg_adjustment)); /* Store results. */ vst1q_u8(running_avg_y, v_running_avg_y); @@ -131,23 +125,19 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, * for this macroblock. */ { - int s0 = vgetq_lane_s8(v_sum_diff, 0) + - vgetq_lane_s8(v_sum_diff, 1) + - vgetq_lane_s8(v_sum_diff, 2) + - vgetq_lane_s8(v_sum_diff, 3); - int s1 = vgetq_lane_s8(v_sum_diff, 4) + - vgetq_lane_s8(v_sum_diff, 5) + - vgetq_lane_s8(v_sum_diff, 6) + - vgetq_lane_s8(v_sum_diff, 7); - int s2 = vgetq_lane_s8(v_sum_diff, 8) + - vgetq_lane_s8(v_sum_diff, 9) + - vgetq_lane_s8(v_sum_diff, 10) + - vgetq_lane_s8(v_sum_diff, 11); - int s3 = vgetq_lane_s8(v_sum_diff, 12) + - vgetq_lane_s8(v_sum_diff, 13) + - vgetq_lane_s8(v_sum_diff, 14) + - vgetq_lane_s8(v_sum_diff, 15); - sum_diff += s0 + s1+ s2 + s3; + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); } /* Update pointers for next iteration. */ @@ -157,11 +147,20 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, } /* Too much adjustments => copy block. */ - if (abs(sum_diff) > SUM_DIFF_THRESHOLD) - return COPY_BLOCK; + { + const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (s0 > SUM_DIFF_THRESHOLD) + return COPY_BLOCK; + } /* Tell above level that block was filtered. */ - vp8_copy_mem16x16(running_avg->y_buffer + y_offset, running_avg_y_stride, - signal->thismb, sig_stride); + running_avg_y -= running_avg_y_stride * 16; + sig -= sig_stride * 16; + + vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride); + return FILTER_BLOCK; } -- 2.40.0