Faster VSX vpx_quantize_b

author Luc Trudeau <luc@trud.ca>
Thu, 10 May 2018 20:54:13 +0000 (16:54 -0400)
committer Luc Trudeau <luc@trud.ca>
Fri, 11 May 2018 01:23:39 +0000 (21:23 -0400)
diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c

index e037f89e36b76eddd1261ab5f3f403ca875ef655..361f9e3d341f45d647ca18ffc583611e75cd7288 100644 (file)
--- a/vpx_dsp/ppc/quantize_vsx.c
+++ b/vpx_dsp/ppc/quantize_vsx.c
@@ -41,10 +41,9 @@ static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs,
  }
  
  static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
-                                          const int16_t *iscan_ptr) {
-  bool16x8_t zero_coeff;
-  int16x8_t scan = vec_vsx_ld(0, iscan_ptr);
-  zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+                                          const int16_t *iscan_ptr, int index) {
+  int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
+  bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
    scan = vec_sub(scan, mask);
    return vec_andc(scan, zero_coeff);
  }
@@ -64,7 +63,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan_ptr,
                          const int16_t *iscan_ptr) {
-  int16x8_t qcoeff, dqcoeff, eob;
+  int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+  bool16x8_t zero_mask0, zero_mask1;
  
    // First set of 8 coeff starts with DC + 7 AC
    int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
@@ -73,51 +73,86 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
    int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
    int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
  
-  int16x8_t coeff = vec_vsx_ld(0, coeff_ptr);
-  int16x8_t coeff_abs = vec_abs(coeff);
-  bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin);
+  int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+  int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+
+  int16x8_t coeff0_abs = vec_abs(coeff0);
+  int16x8_t coeff1_abs = vec_abs(coeff1);
+
+  zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+  zbin = vec_splat(zbin, 1);
+  zero_mask1 = vec_cmpge(coeff1_abs, zbin);
  
    (void)scan_ptr;
    (void)skip_block;
    assert(!skip_block);
  
-  qcoeff =
-      quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
-  vec_vsx_st(qcoeff, 0, qcoeff_ptr);
-
-  dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
-  vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
-
-  eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr);
-
-  // All other sets of 8 coeffs will only contain AC
-  zbin = vec_splat(zbin, 1);
+  qcoeff0 =
+      quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);
+  vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
    round = vec_splat(round, 1);
    quant = vec_splat(quant, 1);
-  dequant = vec_splat(dequant, 1);
    quant_shift = vec_splat(quant_shift, 1);
+  qcoeff1 =
+      quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1);
+  vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
  
-  n_coeffs -= 8;
-  do {
-    coeff_ptr += 8;
-    qcoeff_ptr += 8;
-    dqcoeff_ptr += 8;
-    iscan_ptr += 8;
-
-    coeff = vec_vsx_ld(0, coeff_ptr);
-    coeff_abs = vec_abs(coeff);
-    zero_mask = vec_cmpge(coeff_abs, zbin);
-    qcoeff =
-        quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
-    vec_vsx_st(qcoeff, 0, qcoeff_ptr);
-
-    dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
-    vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
-
-    eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr));
-
-    n_coeffs -= 8;
-  } while (n_coeffs > 0);
+  dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+  dequant = vec_splat(dequant, 1);
+  dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+  eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
+                nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+
+  if (n_coeffs > 16) {
+    int index = 16;
+    int off0 = 32;
+    int off1 = 48;
+    int off2 = 64;
+    do {
+      int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2;
+      bool16x8_t zero_mask2;
+      coeff0 = vec_vsx_ld(off0, coeff_ptr);
+      coeff1 = vec_vsx_ld(off1, coeff_ptr);
+      coeff2 = vec_vsx_ld(off2, coeff_ptr);
+      coeff0_abs = vec_abs(coeff0);
+      coeff1_abs = vec_abs(coeff1);
+      coeff2_abs = vec_abs(coeff2);
+      zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+      zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+      zero_mask2 = vec_cmpge(coeff2_abs, zbin);
+      qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift,
+                               zero_mask0);
+      qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift,
+                               zero_mask1);
+      qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift,
+                               zero_mask2);
+      vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+      vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+      vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+      dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+      dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+      dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
+
+      vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+      vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+      vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+      eob =
+          vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
+      eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
+                     nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+      eob = vec_max(eob, eob2);
+
+      index += 24;
+      off0 += 48;
+      off1 += 48;
+      off2 += 48;
+    } while (index < n_coeffs);
+  }
  
    eob = vec_max_across(eob);
    *eob_ptr = eob[0];
author	Luc Trudeau <luc@trud.ca>
	Thu, 10 May 2018 20:54:13 +0000 (16:54 -0400)
committer	Luc Trudeau <luc@trud.ca>
	Fri, 11 May 2018 01:23:39 +0000 (21:23 -0400)