vp9/encoder/arm/neon/vp9_denoiser_neon.c
/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"

#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/encoder/vp9_context_tree.h"
#include "vp9/encoder/vp9_denoiser.h"
#include "vpx_mem/vpx_mem.h"

// Compute the sum of all pixel differences of this MB.
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
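  // The intermediate names list the source byte lanes (0x0-0xf) that each
  // element covers after every pairwise widening add, e.g. element "fe" of
  // the first result holds lanes 0xf + 0xe of the input vector.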
  const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
  const int32x4_t fedc_ba98_7654_3210 = vpaddlq_s16(fe_dc_ba_98_76_54_32_10);
  const int64x2_t fedcba98_76543210 = vpaddlq_s32(fedc_ba98_7654_3210);
  const int64x1_t x = vqadd_s64(vget_high_s64(fedcba98_76543210),
                                vget_low_s64(fedcba98_76543210));
  const int sum_diff = vget_lane_s32(vreinterpret_s32_s64(x), 0);
  return sum_diff;
}

// Denoise a 16x1 vector.
static INLINE int8x16_t denoiser_16x1_neon(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const uint8x16_t v_level1_threshold, const uint8x16_t v_level2_threshold,
    const uint8x16_t v_level3_threshold, const uint8x16_t v_level1_adjustment,
    const uint8x16_t v_delta_level_1_and_2,
    const uint8x16_t v_delta_level_2_and_3, int8x16_t v_sum_diff_total) {
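  // Per-pixel filtering scheme, as configured by the callers in this file:
  // differences below v_level1_threshold (level 0) are applied in full, while
  // larger differences are replaced by a capped adjustment that grows by
  // v_delta_level_1_and_2 and v_delta_level_2_and_3 once the difference
  // crosses the level 2 and level 3 thresholds.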
  const uint8x16_t v_sig = vld1q_u8(sig);
  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);

  /* Calculate absolute difference and sign masks. */
  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);

  /* Figure out which level the absolute difference puts us in. */
  const uint8x16_t v_level1_mask = vcleq_u8(v_level1_threshold, v_abs_diff);
  const uint8x16_t v_level2_mask = vcleq_u8(v_level2_threshold, v_abs_diff);
  const uint8x16_t v_level3_mask = vcleq_u8(v_level3_threshold, v_abs_diff);

  /* Calculate absolute adjustments for level 1, 2 and 3. */
  const uint8x16_t v_level2_adjustment =
      vandq_u8(v_level2_mask, v_delta_level_1_and_2);
  const uint8x16_t v_level3_adjustment =
      vandq_u8(v_level3_mask, v_delta_level_2_and_3);
  const uint8x16_t v_level1and2_adjustment =
      vaddq_u8(v_level1_adjustment, v_level2_adjustment);
  const uint8x16_t v_level1and2and3_adjustment =
      vaddq_u8(v_level1and2_adjustment, v_level3_adjustment);

  /* Select the absolute adjustment: the raw absolute difference when in
   * level 0, otherwise the combined value for levels 1, 2 and 3.
   */
  const uint8x16_t v_abs_adjustment =
      vbslq_u8(v_level1_mask, v_level1and2and3_adjustment, v_abs_diff);

  /* Calculate positive and negative adjustments. Apply them to the signal
   * and accumulate them. Adjustments are less than eight and the maximum
   * sum of them (7 * 16) can fit in a signed char.
   */
  const uint8x16_t v_pos_adjustment =
      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
  const uint8x16_t v_neg_adjustment =
      vandq_u8(v_diff_neg_mask, v_abs_adjustment);

  uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment);
  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment);

  /* Store results. */
  vst1q_u8(running_avg_y, v_running_avg_y);

  /* Sum all the accumulators to have the sum of all pixel differences
   * for this macroblock.
   */
  {
    const int8x16_t v_sum_diff =
        vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment),
                  vreinterpretq_s8_u8(v_neg_adjustment));
    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
  }
  return v_sum_diff_total;
}

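// Second-pass kernel: pulls the already-stored running average back toward
// the source by at most k_delta per pixel, undoing part of the first-pass
// adjustment, and accumulates the change with the opposite sign so the
// caller can re-check the total against its threshold.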
static INLINE int8x16_t denoiser_adjust_16x1_neon(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const uint8x16_t k_delta, int8x16_t v_sum_diff_total) {
  uint8x16_t v_running_avg_y = vld1q_u8(running_avg_y);
  const uint8x16_t v_sig = vld1q_u8(sig);
  const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y);

  /* Calculate absolute difference and sign masks. */
  const uint8x16_t v_abs_diff = vabdq_u8(v_sig, v_mc_running_avg_y);
  const uint8x16_t v_diff_pos_mask = vcltq_u8(v_sig, v_mc_running_avg_y);
  const uint8x16_t v_diff_neg_mask = vcgtq_u8(v_sig, v_mc_running_avg_y);
  // Clamp absolute difference to delta to get the adjustment.
  const uint8x16_t v_abs_adjustment = vminq_u8(v_abs_diff, k_delta);

  const uint8x16_t v_pos_adjustment =
      vandq_u8(v_diff_pos_mask, v_abs_adjustment);
  const uint8x16_t v_neg_adjustment =
      vandq_u8(v_diff_neg_mask, v_abs_adjustment);

  v_running_avg_y = vqsubq_u8(v_running_avg_y, v_pos_adjustment);
  v_running_avg_y = vqaddq_u8(v_running_avg_y, v_neg_adjustment);

  /* Store results. */
  vst1q_u8(running_avg_y, v_running_avg_y);

  {
    const int8x16_t v_sum_diff =
        vqsubq_s8(vreinterpretq_s8_u8(v_neg_adjustment),
                  vreinterpretq_s8_u8(v_pos_adjustment));
    v_sum_diff_total = vaddq_s8(v_sum_diff_total, v_sum_diff);
  }
  return v_sum_diff_total;
}

// Denoise 8x8 and 8x16 blocks.
static int vp9_denoiser_8xN_neon(const uint8_t *sig, int sig_stride,
                                 const uint8_t *mc_running_avg_y,
                                 int mc_avg_y_stride, uint8_t *running_avg_y,
                                 int avg_y_stride, int increase_denoising,
                                 BLOCK_SIZE bs, int motion_magnitude,
                                 int width) {
  int sum_diff_thresh, r, sum_diff = 0;
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];

  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
  const uint8x16_t v_level1_threshold = vdupq_n_u8(4 + shift_inc);
  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);

  const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;

  int8x16_t v_sum_diff_total = vdupq_n_s8(0);

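  // Each iteration packs two consecutive rows of width (8) pixels side by
  // side into one 16-byte scratch buffer, so the 16x1 kernel denoises two
  // block rows per call; the result is then split back into the two
  // destination rows.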
  for (r = 0; r < b_height; ++r) {
    memcpy(sig_buffer[r], sig, width);
    memcpy(sig_buffer[r] + width, sig + sig_stride, width);
    memcpy(mc_running_buffer[r], mc_running_avg_y, width);
    memcpy(mc_running_buffer[r] + width, mc_running_avg_y + mc_avg_y_stride,
           width);
    memcpy(running_buffer[r], running_avg_y, width);
    memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
    v_sum_diff_total = denoiser_16x1_neon(
        sig_buffer[r], mc_running_buffer[r], running_buffer[r],
        v_level1_threshold, v_level2_threshold, v_level3_threshold,
        v_level1_adjustment, v_delta_level_1_and_2, v_delta_level_2_and_3,
        v_sum_diff_total);
    {
      const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
      const uint8x8_t v_running_buffer_high = vget_high_u8(v_running_buffer);
      const uint8x8_t v_running_buffer_low = vget_low_u8(v_running_buffer);
      vst1_u8(running_avg_y, v_running_buffer_low);
      vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
    }
    // Update pointers for next iteration.
    sig += (sig_stride << 1);
    mc_running_avg_y += (mc_avg_y_stride << 1);
    running_avg_y += (avg_y_stride << 1);
  }

  {
    sum_diff = horizontal_add_s8x16(v_sum_diff_total);
    sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const uint8x16_t k_delta = vmovq_n_u8(delta);
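        // Rewind running_avg_y to the top of the block; the scratch buffers
        // (indexed by r) still hold the data needed for the second pass.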
        running_avg_y -= avg_y_stride * (b_height << 1);
        for (r = 0; r < b_height; ++r) {
          v_sum_diff_total = denoiser_adjust_16x1_neon(
              sig_buffer[r], mc_running_buffer[r], running_buffer[r], k_delta,
              v_sum_diff_total);
          {
            const uint8x16_t v_running_buffer = vld1q_u8(running_buffer[r]);
            const uint8x8_t v_running_buffer_high =
                vget_high_u8(v_running_buffer);
            const uint8x8_t v_running_buffer_low =
                vget_low_u8(v_running_buffer);
            vst1_u8(running_avg_y, v_running_buffer_low);
            vst1_u8(running_avg_y + avg_y_stride, v_running_buffer_high);
          }
          // Update pointers for next iteration.
          running_avg_y += (avg_y_stride << 1);
        }
        sum_diff = horizontal_add_s8x16(v_sum_diff_total);
        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }

  return FILTER_BLOCK;
}

// Denoise 16x8, 16x16, 16x32, 32x16, 32x32, 32x64, 64x32 and 64x64 blocks.
static int vp9_denoiser_NxM_neon(const uint8_t *sig, int sig_stride,
                                 const uint8_t *mc_running_avg_y,
                                 int mc_avg_y_stride, uint8_t *running_avg_y,
                                 int avg_y_stride, int increase_denoising,
                                 BLOCK_SIZE bs, int motion_magnitude) {
  const int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  const uint8x16_t v_level1_adjustment = vmovq_n_u8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 4 + shift_inc : 3);
  const uint8x16_t v_delta_level_1_and_2 = vdupq_n_u8(1);
  const uint8x16_t v_delta_level_2_and_3 = vdupq_n_u8(2);
  const uint8x16_t v_level1_threshold = vmovq_n_u8(4 + shift_inc);
  const uint8x16_t v_level2_threshold = vdupq_n_u8(8);
  const uint8x16_t v_level3_threshold = vdupq_n_u8(16);

  const int b_width = (4 << b_width_log2_lookup[bs]);
  const int b_height = (4 << b_height_log2_lookup[bs]);
  const int b_width_shift4 = b_width >> 4;

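  // One int8x16 accumulator per 16-column group (first index) and per 16-row
  // band (second index, r >> 4), presumably so the narrow int8 lanes never
  // accumulate adjustments from more than 16 rows at a time.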
  int8x16_t v_sum_diff_total[4][4];
  int r, c, sum_diff = 0;

  for (r = 0; r < 4; ++r) {
    for (c = 0; c < b_width_shift4; ++c) {
      v_sum_diff_total[c][r] = vdupq_n_s8(0);
    }
  }

  for (r = 0; r < b_height; ++r) {
    for (c = 0; c < b_width_shift4; ++c) {
      v_sum_diff_total[c][r >> 4] = denoiser_16x1_neon(
          sig, mc_running_avg_y, running_avg_y, v_level1_threshold,
          v_level2_threshold, v_level3_threshold, v_level1_adjustment,
          v_delta_level_1_and_2, v_delta_level_2_and_3,
          v_sum_diff_total[c][r >> 4]);

      // Update pointers for next iteration.
      sig += 16;
      mc_running_avg_y += 16;
      running_avg_y += 16;
    }

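    // Flush the partial sums at the end of each 16-row band (row 7 for
    // BLOCK_16X8, which is only 8 rows tall).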
    if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
      for (c = 0; c < b_width_shift4; ++c) {
        sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
      }
    }

    // Update pointers for next iteration.
    sig = sig - b_width + sig_stride;
    mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
    running_avg_y = running_avg_y - b_width + avg_y_stride;
  }

  {
    const int sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
    if (abs(sum_diff) > sum_diff_thresh) {
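      // Same fallback as in vp9_denoiser_8xN_neon: rather than copying the
      // block outright, run a second pass that pulls running_avg_y back
      // toward sig by a small capped delta and re-check the total.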
      const int delta =
          ((abs(sum_diff) - sum_diff_thresh) >> num_pels_log2_lookup[bs]) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const uint8x16_t k_delta = vdupq_n_u8(delta);
        sig -= sig_stride * b_height;
        mc_running_avg_y -= mc_avg_y_stride * b_height;
        running_avg_y -= avg_y_stride * b_height;
        sum_diff = 0;

        for (r = 0; r < b_height; ++r) {
          for (c = 0; c < b_width_shift4; ++c) {
            v_sum_diff_total[c][r >> 4] =
                denoiser_adjust_16x1_neon(sig, mc_running_avg_y, running_avg_y,
                                          k_delta, v_sum_diff_total[c][r >> 4]);

            // Update pointers for next iteration.
            sig += 16;
            mc_running_avg_y += 16;
            running_avg_y += 16;
          }
          if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
            for (c = 0; c < b_width_shift4; ++c) {
              sum_diff += horizontal_add_s8x16(v_sum_diff_total[c][r >> 4]);
            }
          }

          sig = sig - b_width + sig_stride;
          mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
          running_avg_y = running_avg_y - b_width + avg_y_stride;
        }

        if (abs(sum_diff) > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }
  return FILTER_BLOCK;
}

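// Block-size dispatcher for the NEON denoiser, presumably selected at run
// time through the vp9_rtcd dispatch (vp9_rtcd.h is included above). Block
// sizes not handled by either helper fall through to COPY_BLOCK.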
int vp9_denoiser_filter_neon(const uint8_t *sig, int sig_stride,
                             const uint8_t *mc_avg, int mc_avg_stride,
                             uint8_t *avg, int avg_stride,
                             int increase_denoising, BLOCK_SIZE bs,
                             int motion_magnitude) {
  // Block sizes are ordered by frequency so the most common cases terminate
  // the checks early.
  if (bs == BLOCK_16X16 || bs == BLOCK_32X32 || bs == BLOCK_64X64 ||
      bs == BLOCK_16X32 || bs == BLOCK_16X8 || bs == BLOCK_32X16 ||
      bs == BLOCK_32X64 || bs == BLOCK_64X32) {
    return vp9_denoiser_NxM_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
                                 avg_stride, increase_denoising, bs,
                                 motion_magnitude);
  } else if (bs == BLOCK_8X8 || bs == BLOCK_8X16) {
    return vp9_denoiser_8xN_neon(sig, sig_stride, mc_avg, mc_avg_stride, avg,
                                 avg_stride, increase_denoising, bs,
                                 motion_magnitude, 8);
  }
  return COPY_BLOCK;
}
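
// Minimal usage sketch (hypothetical; `src`, `mc_avg`, `avg` and the strides
// are placeholder names for buffers the caller already owns, and a 16x16
// block is assumed):
//
//   const int decision = vp9_denoiser_filter_neon(
//       src, src_stride, mc_avg, mc_avg_stride, avg, avg_stride,
//       /*increase_denoising=*/0, BLOCK_16X16, motion_magnitude);
//   if (decision == COPY_BLOCK) {
//     // The caller keeps the source block instead of the filtered average.
//   }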