granicus.if.org Git - libvpx/blob - vpx_dsp/x86/quantize_ssse3.c

   1 /*
   2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
   3  *
   4  *  Use of this source code is governed by a BSD-style license
   5  *  that can be found in the LICENSE file in the root of the source
   6  *  tree. An additional intellectual property rights grant can be found
   7  *  in the file PATENTS.  All contributing project authors may
   8  *  be found in the AUTHORS file in the root of the source tree.
   9  */
  10
  11 #include <assert.h>
  12 #include <tmmintrin.h>
  13
  14 #include "./vpx_dsp_rtcd.h"
  15 #include "vpx/vpx_integer.h"
  16 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
  17 #include "vpx_dsp/x86/quantize_x86.h"
  18
  19 void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
  20                           int skip_block, const int16_t *zbin_ptr,
  21                           const int16_t *round_ptr, const int16_t *quant_ptr,
  22                           const int16_t *quant_shift_ptr,
  23                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
  24                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
  25                           const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  26   const __m128i zero = _mm_setzero_si128();
  27   int index = 16;
  28
  29   __m128i zbin, round, quant, dequant, shift;
  30   __m128i coeff0, coeff1;
  31   __m128i qcoeff0, qcoeff1;
  32   __m128i cmp_mask0, cmp_mask1;
  33   __m128i eob, eob0;
  34
  35   (void)scan_ptr;
  36   (void)skip_block;
  37   assert(!skip_block);
  38
  39   load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
  40                 dequant_ptr, &dequant, quant_shift_ptr, &shift);
  41
  42   // Do DC and first 15 AC.
  43   coeff0 = load_tran_low(coeff_ptr);
  44   coeff1 = load_tran_low(coeff_ptr + 8);
  45
  46   qcoeff0 = _mm_abs_epi16(coeff0);
  47   qcoeff1 = _mm_abs_epi16(coeff1);
  48
  49   cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  50   zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  51   cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
  52
  53   calculate_qcoeff(&qcoeff0, round, quant, shift);
  54   round = _mm_unpackhi_epi64(round, round);
  55   quant = _mm_unpackhi_epi64(quant, quant);
  56   shift = _mm_unpackhi_epi64(shift, shift);
  57   calculate_qcoeff(&qcoeff1, round, quant, shift);
  58
  59   // Reinsert signs
  60   qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
  61   qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
  62
  63   // Mask out zbin threshold coeffs
  64   qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
  65   qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
  66
  67   store_tran_low(qcoeff0, qcoeff_ptr);
  68   store_tran_low(qcoeff1, qcoeff_ptr + 8);
  69
  70   coeff0 = calculate_dqcoeff(qcoeff0, dequant);
  71   dequant = _mm_unpackhi_epi64(dequant, dequant);
  72   coeff1 = calculate_dqcoeff(qcoeff1, dequant);
  73
  74   store_tran_low(coeff0, dqcoeff_ptr);
  75   store_tran_low(coeff1, dqcoeff_ptr + 8);
  76
  77   eob =
  78       scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
  79
  80   // AC only loop.
  81   while (index < n_coeffs) {
  82     coeff0 = load_tran_low(coeff_ptr + index);
  83     coeff1 = load_tran_low(coeff_ptr + index + 8);
  84
  85     qcoeff0 = _mm_abs_epi16(coeff0);
  86     qcoeff1 = _mm_abs_epi16(coeff1);
  87
  88     cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  89     cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
  90
  91     calculate_qcoeff(&qcoeff0, round, quant, shift);
  92     calculate_qcoeff(&qcoeff1, round, quant, shift);
  93
  94     qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
  95     qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
  96
  97     qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
  98     qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
  99
 100     store_tran_low(qcoeff0, qcoeff_ptr + index);
 101     store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
 102
 103     coeff0 = calculate_dqcoeff(qcoeff0, dequant);
 104     coeff1 = calculate_dqcoeff(qcoeff1, dequant);
 105
 106     store_tran_low(coeff0, dqcoeff_ptr + index);
 107     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 108
 109     eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
 110                         index, zero);
 111     eob = _mm_max_epi16(eob, eob0);
 112
 113     index += 16;
 114   }
 115
 116   *eob_ptr = accumulate_eob(eob);
 117 }
 118
 119 void vpx_quantize_b_32x32_ssse3(
 120     const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
 121     const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
 122     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
 123     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
 124     const int16_t *scan_ptr, const int16_t *iscan_ptr) {
 125   const __m128i zero = _mm_setzero_si128();
 126   const __m128i one = _mm_set1_epi16(1);
 127   int index;
 128
 129   __m128i zbin, round, quant, dequant, shift;
 130   __m128i coeff0, coeff1;
 131   __m128i qcoeff0, qcoeff1;
 132   __m128i cmp_mask0, cmp_mask1;
 133   __m128i all_zero;
 134   __m128i eob = zero, eob0;
 135
 136   (void)scan_ptr;
 137   (void)n_coeffs;
 138   (void)skip_block;
 139   assert(!skip_block);
 140
 141   // Setup global values.
 142   // The 32x32 halves zbin and round.
 143   zbin = _mm_load_si128((const __m128i *)zbin_ptr);
 144   // Shift with rounding.
 145   zbin = _mm_add_epi16(zbin, one);
 146   zbin = _mm_srli_epi16(zbin, 1);
 147   // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
 148   // it is a strict "greater" comparison.
 149   zbin = _mm_sub_epi16(zbin, one);
 150
 151   round = _mm_load_si128((const __m128i *)round_ptr);
 152   round = _mm_add_epi16(round, one);
 153   round = _mm_srli_epi16(round, 1);
 154
 155   quant = _mm_load_si128((const __m128i *)quant_ptr);
 156   dequant = _mm_load_si128((const __m128i *)dequant_ptr);
 157   shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
 158   // I suspect this is not technically OK because quant_shift can be up
 159   // to 1 << 16 and shifting up again will outrange that, but the test is not
 160   // comprehensive enough to catch that and "it's been that way forever"
 161   shift = _mm_slli_epi16(shift, 1);
 162
 163   // Do DC and first 15 AC.
 164   coeff0 = load_tran_low(coeff_ptr);
 165   coeff1 = load_tran_low(coeff_ptr + 8);
 166
 167   qcoeff0 = _mm_abs_epi16(coeff0);
 168   qcoeff1 = _mm_abs_epi16(coeff1);
 169
 170   cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
 171   zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
 172   cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
 173
 174   all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
 175   if (_mm_movemask_epi8(all_zero) == 0) {
 176     _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
 177     _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
 178     _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
 179     _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
 180 #if CONFIG_VP9_HIGHBITDEPTH
 181     _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
 182     _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
 183     _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
 184     _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
 185 #endif  // CONFIG_HIGHBITDEPTH
 186
 187     round = _mm_unpackhi_epi64(round, round);
 188     quant = _mm_unpackhi_epi64(quant, quant);
 189     shift = _mm_unpackhi_epi64(shift, shift);
 190     dequant = _mm_unpackhi_epi64(dequant, dequant);
 191   } else {
 192     calculate_qcoeff(&qcoeff0, round, quant, shift);
 193     round = _mm_unpackhi_epi64(round, round);
 194     quant = _mm_unpackhi_epi64(quant, quant);
 195     shift = _mm_unpackhi_epi64(shift, shift);
 196     calculate_qcoeff(&qcoeff1, round, quant, shift);
 197
 198     // Reinsert signs.
 199     qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
 200     qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
 201
 202     // Mask out zbin threshold coeffs.
 203     qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
 204     qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
 205
 206     store_tran_low(qcoeff0, qcoeff_ptr);
 207     store_tran_low(qcoeff1, qcoeff_ptr + 8);
 208
 209     // Un-sign to bias rounding like C.
 210     // dequant is almost always negative, so this is probably the backwards way
 211     // to handle the sign. However, it matches the previous assembly.
 212     coeff0 = _mm_abs_epi16(qcoeff0);
 213     coeff1 = _mm_abs_epi16(qcoeff1);
 214
 215     coeff0 = calculate_dqcoeff(coeff0, dequant);
 216     dequant = _mm_unpackhi_epi64(dequant, dequant);
 217     coeff1 = calculate_dqcoeff(coeff1, dequant);
 218
 219     // "Divide" by 2.
 220     coeff0 = _mm_srli_epi16(coeff0, 1);
 221     coeff1 = _mm_srli_epi16(coeff1, 1);
 222
 223     coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
 224     coeff1 = _mm_sign_epi16(coeff1, qcoeff1);
 225
 226     store_tran_low(coeff0, dqcoeff_ptr);
 227     store_tran_low(coeff1, dqcoeff_ptr + 8);
 228
 229     eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0,
 230                        zero);
 231   }
 232
 233   // AC only loop.
 234   for (index = 16; index < 32 * 32; index += 16) {
 235     coeff0 = load_tran_low(coeff_ptr + index);
 236     coeff1 = load_tran_low(coeff_ptr + index + 8);
 237
 238     qcoeff0 = _mm_abs_epi16(coeff0);
 239     qcoeff1 = _mm_abs_epi16(coeff1);
 240
 241     cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
 242     cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
 243
 244     all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
 245     if (_mm_movemask_epi8(all_zero) == 0) {
 246       _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
 247       _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
 248       _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
 249       _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
 250 #if CONFIG_VP9_HIGHBITDEPTH
 251       _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
 252       _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
 253       _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
 254       _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
 255 #endif  // CONFIG_VP9_HIGHBITDEPTH
 256       continue;
 257     }
 258
 259     calculate_qcoeff(&qcoeff0, round, quant, shift);
 260     calculate_qcoeff(&qcoeff1, round, quant, shift);
 261
 262     qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
 263     qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
 264
 265     qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
 266     qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
 267
 268     store_tran_low(qcoeff0, qcoeff_ptr + index);
 269     store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
 270
 271     coeff0 = _mm_abs_epi16(qcoeff0);
 272     coeff1 = _mm_abs_epi16(qcoeff1);
 273
 274     coeff0 = calculate_dqcoeff(coeff0, dequant);
 275     coeff1 = calculate_dqcoeff(coeff1, dequant);
 276
 277     coeff0 = _mm_srli_epi16(coeff0, 1);
 278     coeff1 = _mm_srli_epi16(coeff1, 1);
 279
 280     coeff0 = _mm_sign_epi16(coeff0, qcoeff0);
 281     coeff1 = _mm_sign_epi16(coeff1, qcoeff1);
 282
 283     store_tran_low(coeff0, dqcoeff_ptr + index);
 284     store_tran_low(coeff1, dqcoeff_ptr + index + 8);
 285
 286     eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
 287                         index, zero);
 288     eob = _mm_max_epi16(eob, eob0);
 289   }
 290
 291   *eob_ptr = accumulate_eob(eob);
 292 }