/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"

#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_denoiser.h"
#include "vpx_mem/vpx_mem.h"
// Compute the sum of all pixel differences of this MB.
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
  // Pairwise widening adds: 16 x s8 -> 8 x s16 -> 4 x s32 -> 2 x s64.
  const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
  const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
  const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
  // Saturating add of the two 64-bit halves yields the total in lane 0.
  const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
                                vget_low_s64(fedcba98_76543210));
  // The sum fits in 32 bits (at most 16 * 127 in magnitude), so reading the
  // low 32-bit lane is safe.
  const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
  // Restored: the truncated source dropped the return and closing brace.
  return sum_diff;
}
// Denoise a 16x1 vector (one 16-pixel row): move sig toward
// mc_running_avg_y by a per-pixel adjustment chosen from three absolute
// difference thresholds, store the result to running_avg_y, and accumulate
// the signed per-pixel adjustments into v_sum_diff_total (returned).
static INLINE int8x16_t denoiser_16x1_neon(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
    const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
    const uint8x16_t v_delta_level_1_and_2,
    const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
  const uint8x16_t v_sig = vld1q_u8(sig);
  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);

  /* Calculate absolute difference and sign masks. */
  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);

  /* Figure out which level that put us in. */
  const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
  const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
  const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);

  /* Calculate absolute adjustments for level 1, 2 and 3. */
  const uint8x16_t v_level2_adjustment =
      vandq_u8(v_level2_mask, v_delta_level_1_and_2);
  const uint8x16_t v_level3_adjustment =
      vandq_u8(v_level3_mask, v_delta_level_2_and_3);
  const uint8x16_t v_level1and2_adjustment =
      vaddq_u8(v_level1_adjustment, v_level2_adjustment);
  const uint8x16_t v_level1and2and3_adjustment =
      vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);

  /* Figure adjustment absolute value by selecting between the absolute
   * difference if in level0 or the value for level 1, 2 and 3.
   */
  const uint8x16_t v_abs_adjustment =
      vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);

  /* Calculate positive and negative adjustments. Apply them to the signal
   * and accumulate them. Adjustments are less than eight and the maximum
   * sum of them (7 * 16) can fit in a signed char.
   */
  const uint8x16_t v_pos_adjustment =
      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
  const uint8x16_t v_neg_adjustment =
      vandq_u8(v_diff_neg_mask, v_abs_adjustment);

  uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);

  /* Store results. */
  vst1q_u8(running_avg_y, v_running_avg_y);

  /* Sum all the accumulators to have the sum of all pixel differences
   * for this macroblock.
   */
  {
    const int8x16_t v_sum_diff =
        vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
                  vreinterpretq_s8_u8(v_neg_adjustment));
    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
  }
  return v_sum_diff_total;
}
// Second-pass adjustment for one 16-pixel row: pull running_avg_y back
// toward sig, capped per pixel by k_delta, and update the signed pixel-diff
// accumulator (returned). Used when the first-pass sum exceeded the strong
// threshold but a weaker filtering may still keep it in range.
static INLINE int8x16_t denoiser_adjust_16x1_neon(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
  uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
  const uint8x16_t v_sig = vld1q_u8(sig);
  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);

  /* Calculate absolute difference and sign masks. */
  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
  // Clamp absolute difference to delta to get the adjustment.
  const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, (k_delta));

  const uint8x16_t v_pos_adjustment =
      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
  const uint8x16_t v_neg_adjustment =
      vandq_u8(v_diff_neg_mask, v_abs_adjustment);

  // Signs are reversed relative to denoiser_16x1_neon: this partially
  // undoes the first-pass adjustment, moving the result toward sig.
  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
  v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);

  /* Store results. */
  vst1q_u8(running_avg_y, v_running_avg_y);

  {
    const int8x16_t v_sum_diff =
        vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
                  vreinterpretq_s8_u8(v_pos_adjustment));
    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
  }
  return v_sum_diff_total;
}
// Denoise 8x8 and 8x16 blocks.
// Two 8-wide rows are packed into each 16-byte scratch row so the 16x1
// kernel can be reused. Returns FILTER_BLOCK when the denoised output is
// acceptable, COPY_BLOCK otherwise.
static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
                                 const uint8_t *mc_running_avg_y,
                                 int mc_avg_y_stride, uint8_t *running_avg_y,
                                 int avg_y_stride, int increase_denoising,
                                 BLOCK_SIZE bs, int motion_magnitude,
                                 int width) {
  int sum_diff_thresh, r, sum_diff = 0;
  // Raise the level-1 threshold/adjustment by one when denoising
  // aggressively on low-motion blocks.
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];

  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
  const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);

  // Half the block height: each iteration packs and processes two rows.
  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;

  int8x16_t v_sum_diff_total = vdupq_n_s8(0);

  for (r = 0; r < b_height; ++r) {
    memcpy(sig_buffer[r], sig, width);
    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
           width);
    memcpy(running_buffer[r], running_avg_y, width);
    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
    v_sum_diff_total = denoiser_16x1_neon(
        sig_buffer[r], mc_running_buffer[r], running_buffer[r],
        v_level1_threshold, v_level2_threshold, v_level3_threshold,
        v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
        v_sum_diff_total);
    {
      // Scatter the two packed 8-wide rows back to the output block.
      const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
      const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
      const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
      vst1_u8(running_avg_y, v_running_buffer_low);
      vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
    }
    // Update pointers for next iteration (two rows at a time).
    sig += (sig_stride << 1);
    mc_running_avg_y += (mc_avg_y_stride << 1);
    running_avg_y += (avg_y_stride << 1);
  }

  {
    sum_diff = horizontal_add_s8x16(v_sum_diff_total);
    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const uint8x16_t k_delta = vmovq_n_u8(delta);
        // Rewind to the top of the block for the second pass.
        running_avg_y -= avg_y_stride * (b_height << 1);
        for (r = 0; r < b_height; ++r) {
          v_sum_diff_total = denoiser_adjust_16x1_neon(
              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
              v_sum_diff_total);
          {
            const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
            const uint8x8_t v_running_buffer_high =
                vget_high_u8(v_running_buffer);
            const uint8x8_t v_running_buffer_low =
                vget_low_u8(v_running_buffer);
            vst1_u8(running_avg_y, v_running_buffer_low);
            vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
          }
          // Update pointers for next iteration.
          running_avg_y += (avg_y_stride << 1);
        }
        sum_diff = horizontal_add_s8x16(v_sum_diff_total);
        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  return FILTER_BLOCK;
}
231 // Denoise 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
232 static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
233 const uint8_t *mc_running_avg_y,
234 int mc_avg_y_stride, uint8_t *running_avg_y,
235 int avg_y_stride, int increase_denoising,
236 BLOCK_SIZE bs, int motion_magnitude) {
237 const int shift_inc =
238 (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
241 const uint8x16_t v_level1_adjustment = vmovq_n_u8(
242 (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
243 const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
244 const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
245 const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
246 const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
247 const uint8x16_t v_level3_threshold = vdupq_n_u8(16);
249 const int b_width = (4 << b_width_log2_lookup[bs]);
250 const int b_height = (4 << b_height_log2_lookup[bs]);
251 const int b_width_shift4 = b_width >> 4;
253 int8x16_t v_sum_diff_total[4][4];
254 int r, c, sum_diff = 0;
256 for (r = 0; r < 4; ++r) {
257 for (c = 0; c < b_width_shift4; ++c) {
258 v_sum_diff_total[c][r] = vdupq_n_s8(0);
262 for (r = 0; r < b_height; ++r) {
263 for (c = 0; c < b_width_shift4; ++c) {
264 v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
265 sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
266 v_level2_threshold, v_level3_threshold, v_level1_adjustment,
267 v_delta_level_1_and_2, v_delta_level_2_and_3,
268 v_sum_diff_total[c][r >> 4]);
270 // Update pointers for next iteration.
272 mc_running_avg_y += 16;
276 if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
277 for (c = 0; c < b_width_shift4; ++c) {
278 sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
282 // Update pointers for next iteration.
283 sig = sig - b_width + sig_stride;
284 mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
285 running_avg_y = running_avg_y - b_width + avg_y_stride;
289 const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
290 if (abs(sum_diff) > sum_diff_thresh) {
292 ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
293 // Only apply the adjustment for max delta up to 3.
295 const uint8x16_t k_delta = vdupq_n_u8(delta);
296 sig -= sig_stride * b_height;
297 mc_running_avg_y -= mc_avg_y_stride * b_height;
298 running_avg_y -= avg_y_stride * b_height;
301 for (r = 0; r < b_height; ++r) {
302 for (c = 0; c < b_width_shift4; ++c) {
303 v_sum_diff_total[c][r >> 4] =
304 denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
305 k_delta, v_sum_diff_total[c][r >> 4]);
307 // Update pointers for next iteration.
309 mc_running_avg_y += 16;
312 if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
313 for (c = 0; c < b_width_shift4; ++c) {
314 sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
318 sig = sig - b_width + sig_stride;
319 mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
320 running_avg_y = running_avg_y - b_width + avg_y_stride;
323 if (abs(sum_diff) > sum_diff_thresh) {
// Public NEON entry point for the VP9 temporal denoiser: dispatch by block
// size to the NxM (width >= 16) or 8xN kernel. Returns FILTER_BLOCK when
// `avg` holds usable denoised output, COPY_BLOCK when the caller should
// copy the source block instead (including unsupported 4x-wide sizes).
int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
                             const uint8_t *mc_avg, int mc_avg_stride,
                             uint8_t *avg, int avg_stride,
                             int increase_denoising, BLOCK_SIZE bs,
                             int motion_magnitude) {
  // Rank by frequency of the block type to have an early termination.
  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
    return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
                                 avg_stride, increase_denoising, bs,
                                 motion_magnitude);
  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
    return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
                                 avg_stride, increase_denoising, bs,
                                 motion_magnitude, 8);
  }
  return COPY_BLOCK;
}