quantize: use aarch64 vmaxv

author Johann <johann.koenig@duck.com>

Mon, 12 Nov 2018 19:30:03 +0000 (11:30 -0800)

committer Johann <johann.koenig@duck.com>

Mon, 12 Nov 2018 19:47:29 +0000 (11:47 -0800)
author Johann <johann.koenig@duck.com>
Mon, 12 Nov 2018 19:30:03 +0000 (11:30 -0800)
committer Johann <johann.koenig@duck.com>
Mon, 12 Nov 2018 19:47:29 +0000 (11:47 -0800)
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c

index c42005df6c81b251976325fcd0ff8cbd30f4e4de..d066be1a7a1aa6ac87c1083740e1d796642d7446 100644 (file)
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -26,9 +26,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
                     zig_zag1 = vld1q_u16(inv_zig_zag + 8);
    int16x8_t x0, x1, sz0, sz1, y0, y1;
    uint16x8_t eob0, eob1;
+#ifndef __aarch64__
    uint16x4_t eob_d16;
    uint32x2_t eob_d32;
    uint32x4_t eob_q32;
+#endif  // __aarch64__
  
    /* sign of z: z >> 15 */
    sz0 = vshrq_n_s16(z0, 15);
@@ -66,11 +68,17 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
  
    /* select the largest value */
    eob0 = vmaxq_u16(eob0, eob1);
+#ifdef __aarch64__
+  *d->eob = (int8_t)vmaxvq_u16(eob0);
+#else
    eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
    eob_q32 = vmovl_u16(eob_d16);
    eob_d32 = vmax_u32(vget_low_u32(eob_q32), vget_high_u32(eob_q32));
    eob_d32 = vpmax_u32(eob_d32, eob_d32);
  
+  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
+#endif  // __aarch64__
+
    /* qcoeff = x */
    vst1q_s16(d->qcoeff, x0);
    vst1q_s16(d->qcoeff + 8, x1);
@@ -78,6 +86,4 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
    /* dqcoeff = x * dequant */
    vst1q_s16(d->dqcoeff, vmulq_s16(dequant0, x0));
    vst1q_s16(d->dqcoeff + 8, vmulq_s16(dequant1, x1));
-
-  vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
  }
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c

index 2cec8bd030fa78237e2745cda8a2e0aa050595cd..8b62b450cef2f0476fd99081e34a086e71891caa 100644 (file)
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -97,6 +97,9 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
      store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
      store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff);
    }
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
+#else
    {
      const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
                                               vget_high_s16(v_eobmax_76543210));
@@ -111,6 +114,7 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
  
      *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
    }
+#endif  // __aarch64__
  }
  
  static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -226,6 +230,9 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
        dqcoeff_ptr += 8;
      }
  
+#ifdef __aarch64__
+    *eob_ptr = vmaxvq_u16(eob_max);
+#else
      {
        const uint16x4_t eob_max_0 =
            vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -233,5 +240,6 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
        const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
        vst1_lane_u16(eob_ptr, eob_max_2, 0);
      }
+#endif  // __aarch64__
    }
  }
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c

index 1e338516f63a8d8767e9c3efe179dedcf0921465..b5d1e7ecb58ea9d5b32e5ea86e16bfe117118d4a 100644 (file)
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -135,6 +135,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
      } while (n_coeffs > 0);
    }
  
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
    {
      const uint16x4_t eob_max_0 =
          vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -142,6 +145,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
      const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
      vst1_lane_u16(eob_ptr, eob_max_2, 0);
    }
+#endif  // __aarch64__
  }
  
  static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -288,6 +292,9 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
      }
    }
  
+#ifdef __aarch64__
+  *eob_ptr = vmaxvq_u16(eob_max);
+#else
    {
      const uint16x4_t eob_max_0 =
          vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
@@ -295,4 +302,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
      const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
      vst1_lane_u16(eob_ptr, eob_max_2, 0);
    }
+#endif  // __aarch64__
  }
author	Johann <johann.koenig@duck.com>
	Mon, 12 Nov 2018 19:30:03 +0000 (11:30 -0800)
committer	Johann <johann.koenig@duck.com>
	Mon, 12 Nov 2018 19:47:29 +0000 (11:47 -0800)
vp8/encoder/arm/neon/fastquantizeb_neon.c		patch | blob | history
vp9/encoder/arm/neon/vp9_quantize_neon.c		patch | blob | history
vpx_dsp/arm/quantize_neon.c		patch | blob | history