quantize: ignore skip_block in x86

author Johann <johannkoenig@google.com>

Mon, 21 Aug 2017 18:15:23 +0000 (11:15 -0700)

committer Johann <johannkoenig@google.com>

Mon, 21 Aug 2017 21:37:03 +0000 (14:37 -0700)
author Johann <johannkoenig@google.com>
Mon, 21 Aug 2017 18:15:23 +0000 (11:15 -0700)
committer Johann <johannkoenig@google.com>
Mon, 21 Aug 2017 21:37:03 +0000 (14:37 -0700)
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c

index 2362476c1f1b2fed66f3c8f0458b156c75b86f0f..cedf98aff4bb9dbfb88983bf3e9cc1a93ec1d027 100644 (file)
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -8,6 +8,7 @@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
+#include <assert.h>
  #include <emmintrin.h>
  
  #include "vpx_dsp/vpx_dsp_common.h"
@@ -37,54 +38,54 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
    nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
  
    (void)scan;
+  (void)skip_block;
+  assert(!skip_block);
  
    memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
    memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
  
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = ((int)count / 4) - 1; i >= 0; i--) {
-      __m128i coeffs, cmp1, cmp2;
-      int test;
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-      cmp1 = _mm_and_si128(cmp1, cmp2);
-      test = _mm_movemask_epi8(cmp1);
-      if (test == 0xffff)
-        non_zero_regs--;
-      else
-        break;
-    }
+  // Pre-scan pass
+  for (i = ((int)count / 4) - 1; i >= 0; i--) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (test == 0xffff)
+      non_zero_regs--;
+    else
+      break;
+  }
  
-    // Quantization pass:
-    for (i = 0; i < non_zero_regs; i++) {
-      __m128i coeffs, coeffs_sign, tmp1, tmp2;
-      int test;
-      int abs_coeff[4];
-      int coeff_sign[4];
-
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      coeffs_sign = _mm_srai_epi32(coeffs, 31);
-      coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
-      tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
-      tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
-      tmp1 = _mm_or_si128(tmp1, tmp2);
-      test = _mm_movemask_epi8(tmp1);
-      _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
-      _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
-
-      for (j = 0; j < 4; j++) {
-        if (test & (1 << (4 * j))) {
-          int k = 4 * i + j;
-          const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
-          const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
-          const uint32_t abs_qcoeff =
-              (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
-          qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
-          dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
-          if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
-        }
+  // Quantization pass:
+  for (i = 0; i < non_zero_regs; i++) {
+    __m128i coeffs, coeffs_sign, tmp1, tmp2;
+    int test;
+    int abs_coeff[4];
+    int coeff_sign[4];
+
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    coeffs_sign = _mm_srai_epi32(coeffs, 31);
+    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+    tmp1 = _mm_or_si128(tmp1, tmp2);
+    test = _mm_movemask_epi8(tmp1);
+    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+    for (j = 0; j < 4; j++) {
+      if (test & (1 << (4 * j))) {
+        int k = 4 * i + j;
+        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
        }
      }
    }
@@ -105,6 +106,9 @@ void vpx_highbd_quantize_b_32x32_sse2(
    const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
    const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
    (void)scan;
+  (void)skip_block;
+  assert(!skip_block);
+
    zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
    zbins[1] = _mm_set1_epi32(zbin1_tmp);
  
@@ -116,38 +120,35 @@ void vpx_highbd_quantize_b_32x32_sse2(
    memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
  
-  if (!skip_block) {
-    // Pre-scan pass
-    for (i = 0; i < n_coeffs / 4; i++) {
-      __m128i coeffs, cmp1, cmp2;
-      int test;
-      coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-      cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-      cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-      cmp1 = _mm_and_si128(cmp1, cmp2);
-      test = _mm_movemask_epi8(cmp1);
-      if (!(test & 0xf)) idx_arr[idx++] = i * 4;
-      if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
-      if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
-      if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
-    }
+  // Pre-scan pass
+  for (i = 0; i < n_coeffs / 4; i++) {
+    __m128i coeffs, cmp1, cmp2;
+    int test;
+    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+    cmp1 = _mm_and_si128(cmp1, cmp2);
+    test = _mm_movemask_epi8(cmp1);
+    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+  }
  
-    // Quantization pass: only process the coefficients selected in
-    // pre-scan pass. Note: idx can be zero.
-    for (i = 0; i < idx; i++) {
-      const int rc = idx_arr[i];
-      const int coeff = coeff_ptr[rc];
-      const int coeff_sign = (coeff >> 31);
-      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-      const int64_t tmp1 =
-          abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-      const uint32_t abs_qcoeff =
-          (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-      qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-      if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
-    }
+  // Quantization pass: only process the coefficients selected in
+  // pre-scan pass. Note: idx can be zero.
+  for (i = 0; i < idx; i++) {
+    const int rc = idx_arr[i];
+    const int coeff = coeff_ptr[rc];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+    const uint32_t abs_qcoeff =
+        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
    }
    *eob_ptr = eob + 1;
  }
diff --git a/vpx_dsp/x86/quantize_avx_x86_64.asm b/vpx_dsp/x86/quantize_avx_x86_64.asm

index 01c41291bedead58d613154c8d97361d6a0f34b9..6199f7a26038c119366a13b8668249774fc3cd09 100644 (file)
--- a/vpx_dsp/x86/quantize_avx_x86_64.asm
+++ b/vpx_dsp/x86/quantize_avx_x86_64.asm
@@ -19,10 +19,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
  
    vzeroupper
  
-  ; If we can skip this block, then just zero the output
-  cmp                         skipmp, 0
-  jne .blank
-
  %ifnidn %1, b_32x32
  
    ; Special case for ncoeff == 16, as it is frequent and we can save on
@@ -493,48 +489,6 @@ DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
    mov                           [r2], ax
    vzeroupper
    RET
-
-  ; Skip-block, i.e. just write all zeroes
-.blank:
-
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-
-DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
-
-  neg                        ncoeffq
-  pxor                            m7, m7
-
-.blank_loop:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova       [dqcoeffq+ncoeffq*4+ 0], ymm7
-  mova       [dqcoeffq+ncoeffq*4+32], ymm7
-  mova        [qcoeffq+ncoeffq*4+ 0], ymm7
-  mova        [qcoeffq+ncoeffq*4+32], ymm7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], ymm7
-  mova        [qcoeffq+ncoeffq*2+ 0], ymm7
-%endif
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-
-  mov                         [eobq], word 0
-
-  vzeroupper
-  RET
  %endmacro
  
  INIT_XMM avx
diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c

index 32721beb3a61eb5ee85f336135e549cbd8423473..257377b86f5e23602abf1a83a4668925f93e1dcb 100644 (file)
--- a/vpx_dsp/x86/quantize_sse2.c
+++ b/vpx_dsp/x86/quantize_sse2.c
@@ -8,6 +8,7 @@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
+#include <assert.h>
  #include <emmintrin.h>
  #include <xmmintrin.h>
  
@@ -23,7 +24,12 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           uint16_t *eob_ptr, const int16_t *scan_ptr,
                           const int16_t *iscan_ptr) {
    __m128i zero;
+  __m128i eob;
+  __m128i zbin;
+  __m128i round, quant, dequant, shift;
    (void)scan_ptr;
+  (void)skip_block;
+  assert(!skip_block);
  
    coeff_ptr += n_coeffs;
    iscan_ptr += n_coeffs;
@@ -31,193 +37,179 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    dqcoeff_ptr += n_coeffs;
    n_coeffs = -n_coeffs;
    zero = _mm_setzero_si128();
-  if (!skip_block) {
-    __m128i eob;
-    __m128i zbin;
-    __m128i round, quant, dequant, shift;
+  {
+    __m128i coeff0, coeff1;
+
+    // Setup global values
+    {
+      __m128i pw_1;
+      zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+      round = _mm_load_si128((const __m128i *)round_ptr);
+      quant = _mm_load_si128((const __m128i *)quant_ptr);
+      pw_1 = _mm_set1_epi16(1);
+      zbin = _mm_sub_epi16(zbin, pw_1);
+      dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+      shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+    }
+
      {
-      __m128i coeff0, coeff1;
-
-      // Setup global values
-      {
-        __m128i pw_1;
-        zbin = _mm_load_si128((const __m128i *)zbin_ptr);
-        round = _mm_load_si128((const __m128i *)round_ptr);
-        quant = _mm_load_si128((const __m128i *)quant_ptr);
-        pw_1 = _mm_set1_epi16(1);
-        zbin = _mm_sub_epi16(zbin, pw_1);
-        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-        shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
-      }
-
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-        // Do DC and first 15 AC
-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        round = _mm_unpackhi_epi64(round, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        quant = _mm_unpackhi_epi64(quant, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        shift = _mm_unpackhi_epi64(shift, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        dequant = _mm_unpackhi_epi64(dequant, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob = _mm_max_epi16(eob, eob1);
-      }
-      n_coeffs += 8 * 2;
+      __m128i coeff0_sign, coeff1_sign;
+      __m128i qcoeff0, qcoeff1;
+      __m128i qtmp0, qtmp1;
+      __m128i cmp_mask0, cmp_mask1;
+      // Do DC and first 15 AC
+      coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+
+      // Poor man's sign extract
+      coeff0_sign = _mm_srai_epi16(coeff0, 15);
+      coeff1_sign = _mm_srai_epi16(coeff1, 15);
+      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+      cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+      zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
+      cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      round = _mm_unpackhi_epi64(round, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+      quant = _mm_unpackhi_epi64(quant, quant);
+      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+      qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+      qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+      qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+      shift = _mm_unpackhi_epi64(shift, shift);
+      qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+      // Reinsert signs
+      qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+      qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+      // Mask out zbin threshold coeffs
+      qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+      qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+
+      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      dequant = _mm_unpackhi_epi64(dequant, dequant);
+      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }
  
-    // AC only loop
-    while (n_coeffs < 0) {
-      __m128i coeff0, coeff1;
-      {
-        __m128i coeff0_sign, coeff1_sign;
-        __m128i qcoeff0, qcoeff1;
-        __m128i qtmp0, qtmp1;
-        __m128i cmp_mask0, cmp_mask1;
-
-        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
-        // Poor man's sign extract
-        coeff0_sign = _mm_srai_epi16(coeff0, 15);
-        coeff1_sign = _mm_srai_epi16(coeff1, 15);
-        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
-        cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-        qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
-        qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
-        qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
-        qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        // Mask out zbin threshold coeffs
-        qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
-        qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
-
-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      }
-
-      {
-        // Scan for eob
-        __m128i zero_coeff0, zero_coeff1;
-        __m128i nzero_coeff0, nzero_coeff1;
-        __m128i iscan0, iscan1;
-        __m128i eob0, eob1;
-        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
-        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
-        // Add one to convert from indices to counts
-        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
-        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
-        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-        eob0 = _mm_max_epi16(eob0, eob1);
-        eob = _mm_max_epi16(eob, eob0);
-      }
-      n_coeffs += 8 * 2;
+    {
+      // Scan for eob
+      __m128i zero_coeff0, zero_coeff1;
+      __m128i nzero_coeff0, nzero_coeff1;
+      __m128i iscan0, iscan1;
+      __m128i eob1;
+      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+      // Add one to convert from indices to counts
+      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+      eob = _mm_and_si128(iscan0, nzero_coeff0);
+      eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+      eob = _mm_max_epi16(eob, eob1);
      }
+    n_coeffs += 8 * 2;
+  }
  
-    // Accumulate EOB
+  // AC only loop
+  while (n_coeffs < 0) {
+    __m128i coeff0, coeff1;
      {
-      __m128i eob_shuffled;
-      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-      eob = _mm_max_epi16(eob, eob_shuffled);
-      *eob_ptr = _mm_extract_epi16(eob, 1);
+      __m128i coeff0_sign, coeff1_sign;
+      __m128i qcoeff0, qcoeff1;
+      __m128i qtmp0, qtmp1;
+      __m128i cmp_mask0, cmp_mask1;
+
+      coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+
+      // Poor man's sign extract
+      coeff0_sign = _mm_srai_epi16(coeff0, 15);
+      coeff1_sign = _mm_srai_epi16(coeff1, 15);
+      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
+      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+      cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+      cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+      qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+      qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+      qtmp0 = _mm_add_epi16(qtmp0, qcoeff0);
+      qtmp1 = _mm_add_epi16(qtmp1, qcoeff1);
+      qcoeff0 = _mm_mulhi_epi16(qtmp0, shift);
+      qcoeff1 = _mm_mulhi_epi16(qtmp1, shift);
+
+      // Reinsert signs
+      qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign);
+      qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign);
+      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+
+      // Mask out zbin threshold coeffs
+      qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+      qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+      store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+      store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+
+      coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+      coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+      store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
+      store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }
-  } else {
-    do {
-      store_tran_low(zero, dqcoeff_ptr + n_coeffs);
-      store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8);
-      store_tran_low(zero, qcoeff_ptr + n_coeffs);
-      store_tran_low(zero, qcoeff_ptr + n_coeffs + 8);
-      n_coeffs += 8 * 2;
-    } while (n_coeffs < 0);
-    *eob_ptr = 0;
+
+    {
+      // Scan for eob
+      __m128i zero_coeff0, zero_coeff1;
+      __m128i nzero_coeff0, nzero_coeff1;
+      __m128i iscan0, iscan1;
+      __m128i eob0, eob1;
+      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
+      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
+      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
+      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
+      iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
+      iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
+      // Add one to convert from indices to counts
+      iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
+      iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
+      eob0 = _mm_and_si128(iscan0, nzero_coeff0);
+      eob1 = _mm_and_si128(iscan1, nzero_coeff1);
+      eob0 = _mm_max_epi16(eob0, eob1);
+      eob = _mm_max_epi16(eob, eob0);
+    }
+    n_coeffs += 8 * 2;
+  }
+
+  // Accumulate EOB
+  {
+    __m128i eob_shuffled;
+    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+    eob = _mm_max_epi16(eob, eob_shuffled);
+    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+    eob = _mm_max_epi16(eob, eob_shuffled);
+    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+    eob = _mm_max_epi16(eob, eob_shuffled);
+    *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  }
diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c

index 014acb4af5059616512f028ec03503459394c0a7..b074a3e018a61aeb269666c818ba5b8dd2debbd2 100644 (file)
--- a/vpx_dsp/x86/quantize_ssse3.c
+++ b/vpx_dsp/x86/quantize_ssse3.c
@@ -8,6 +8,7 @@
   *  be found in the AUTHORS file in the root of the source tree.
   */
  
+#include <assert.h>
  #include <tmmintrin.h>
  
  #include "./vpx_dsp_rtcd.h"
@@ -28,18 +29,8 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    __m128i round, quant, dequant, shift;
    intptr_t index = 0;
    (void)scan_ptr;
-
-  if (skip_block) {
-    do {
-      store_tran_low(zero, dqcoeff_ptr + index);
-      store_tran_low(zero, dqcoeff_ptr + index + 8);
-      store_tran_low(zero, qcoeff_ptr + index);
-      store_tran_low(zero, qcoeff_ptr + index + 8);
-      index += 16;
-    } while (index < n_coeffs);
-    *eob_ptr = 0;
-    return;
-  }
+  (void)skip_block;
+  assert(!skip_block);
  
    // Setup global values
    {
diff --git a/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/vpx_dsp/x86/quantize_ssse3_x86_64.asm

index 19798422b4aee491c058101c6c4bb3a302deeee3..d64b938282ead470232fb9eeff126346478c8d95 100644 (file)
--- a/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -19,9 +19,6 @@ SECTION .text
  cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                  shift, qcoeff, dqcoeff, dequant, \
                                  eob, scan, iscan
-  cmp                    dword skipm, 0
-  jne .blank
-
    ; actual quantize loop - setup pointers, rounders, etc.
    movifnidn                   coeffq, coeffmp
    movifnidn                  ncoeffq, ncoeffmp
@@ -300,46 +297,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
    pextrw                          r6, m8, 0
    mov                             [r2], r6
    RET
-
-  ; skip-block, i.e. just write all zeroes
-.blank:
-DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \
-            qcoeff, dqcoeff, dequant, eob, scan, iscan
-
-  mov                             r0, dqcoeffmp
-  movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, qcoeffmp
-  mov                             r3, eobmp
-  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
-%if CONFIG_VP9_HIGHBITDEPTH
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
-%else
-  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
-  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
-%endif
-  neg                        ncoeffq
-  pxor                            m7, m7
-.blank_loop:
-%if CONFIG_VP9_HIGHBITDEPTH
-  mova       [dqcoeffq+ncoeffq*4+ 0], m7
-  mova       [dqcoeffq+ncoeffq*4+16], m7
-  mova       [dqcoeffq+ncoeffq*4+32], m7
-  mova       [dqcoeffq+ncoeffq*4+48], m7
-  mova        [qcoeffq+ncoeffq*4+ 0], m7
-  mova        [qcoeffq+ncoeffq*4+16], m7
-  mova        [qcoeffq+ncoeffq*4+32], m7
-  mova        [qcoeffq+ncoeffq*4+48], m7
-%else
-  mova       [dqcoeffq+ncoeffq*2+ 0], m7
-  mova       [dqcoeffq+ncoeffq*2+16], m7
-  mova        [qcoeffq+ncoeffq*2+ 0], m7
-  mova        [qcoeffq+ncoeffq*2+16], m7
-%endif
-  add                        ncoeffq, mmsize
-  jl .blank_loop
-  mov                    word [eobq], 0
-  RET
  %endmacro
  
  INIT_XMM ssse3
author	Johann <johannkoenig@google.com>
	Mon, 21 Aug 2017 18:15:23 +0000 (11:15 -0700)
committer	Johann <johannkoenig@google.com>
	Mon, 21 Aug 2017 21:37:03 +0000 (14:37 -0700)
vpx_dsp/x86/highbd_quantize_intrin_sse2.c		patch \| blob \| history
vpx_dsp/x86/quantize_avx_x86_64.asm		patch \| blob \| history
vpx_dsp/x86/quantize_sse2.c		patch \| blob \| history
vpx_dsp/x86/quantize_ssse3.c		patch \| blob \| history
vpx_dsp/x86/quantize_ssse3_x86_64.asm		patch \| blob \| history