From 81a98509dcd44e939656d8c281e5a5ae5b9926be Mon Sep 17 00:00:00 2001
From: Luc Trudeau
Date: Thu, 10 May 2018 16:54:13 -0400
Subject: [PATCH] Faster VSX vpx_quantize_b

Process 16 coefficients on the first iteration (a full 4x4) and 24
coefficients on subsequent iterations.

VSX/VP9QuantizeTest.DISABLED_Speed
Before:
  4x4   176 ms
  8x8    91 ms
  16x16  72 ms
After:
  4x4   152 ms
  8x8    82 ms
  16x16  64 ms

Change-Id: I07cb130833504206ccdc5bc12ae5af369364999a
---
 vpx_dsp/ppc/quantize_vsx.c | 117 ++++++++++++++++++++++++-------------
 1 file changed, 76 insertions(+), 41 deletions(-)

diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c
index e037f89e3..361f9e3d3 100644
--- a/vpx_dsp/ppc/quantize_vsx.c
+++ b/vpx_dsp/ppc/quantize_vsx.c
@@ -41,10 +41,9 @@ static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs,
 }
 
 static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
-                                          const int16_t *iscan_ptr) {
-  bool16x8_t zero_coeff;
-  int16x8_t scan = vec_vsx_ld(0, iscan_ptr);
-  zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+                                          const int16_t *iscan_ptr, int index) {
+  int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
+  bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
   scan = vec_sub(scan, mask);
   return vec_andc(scan, zero_coeff);
 }
@@ -64,7 +63,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan_ptr,
                         const int16_t *iscan_ptr) {
-  int16x8_t qcoeff, dqcoeff, eob;
+  int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
+  bool16x8_t zero_mask0, zero_mask1;
 
   // First set of 8 coeff starts with DC + 7 AC
   int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
@@ -73,51 +73,86 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
   int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
 
-  int16x8_t coeff = vec_vsx_ld(0, coeff_ptr);
-  int16x8_t coeff_abs = vec_abs(coeff);
-  bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin);
+  int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr);
+  int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr);
+
+  int16x8_t coeff0_abs = vec_abs(coeff0);
+  int16x8_t coeff1_abs = vec_abs(coeff1);
+
+  zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+  zbin = vec_splat(zbin, 1);
+  zero_mask1 = vec_cmpge(coeff1_abs, zbin);
 
   (void)scan_ptr;
   (void)skip_block;
   assert(!skip_block);
 
-  qcoeff =
-      quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
-  vec_vsx_st(qcoeff, 0, qcoeff_ptr);
-
-  dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
-  vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
-
-  eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr);
-
-  // All other sets of 8 coeffs will only contain AC
-  zbin = vec_splat(zbin, 1);
+  qcoeff0 =
+      quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);
+  vec_vsx_st(qcoeff0, 0, qcoeff_ptr);
   round = vec_splat(round, 1);
   quant = vec_splat(quant, 1);
-  dequant = vec_splat(dequant, 1);
   quant_shift = vec_splat(quant_shift, 1);
+  qcoeff1 =
+      quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift, zero_mask1);
+  vec_vsx_st(qcoeff1, 16, qcoeff_ptr);
 
-  n_coeffs -= 8;
-  do {
-    coeff_ptr += 8;
-    qcoeff_ptr += 8;
-    dqcoeff_ptr += 8;
-    iscan_ptr += 8;
-
-    coeff = vec_vsx_ld(0, coeff_ptr);
-    coeff_abs = vec_abs(coeff);
-    zero_mask = vec_cmpge(coeff_abs, zbin);
-    qcoeff =
-        quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
-    vec_vsx_st(qcoeff, 0, qcoeff_ptr);
-
-    dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
-    vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
-
-    eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr));
-
-    n_coeffs -= 8;
-  } while (n_coeffs > 0);
+  dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff0, 0, dqcoeff_ptr);
+  dequant = vec_splat(dequant, 1);
+  dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+  vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
+
+  eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
+                nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+
+  if (n_coeffs > 16) {
+    int index = 16;
+    int off0 = 32;
+    int off1 = 48;
+    int off2 = 64;
+    do {
+      int16x8_t coeff2, coeff2_abs, qcoeff2, dqcoeff2, eob2;
+      bool16x8_t zero_mask2;
+      coeff0 = vec_vsx_ld(off0, coeff_ptr);
+      coeff1 = vec_vsx_ld(off1, coeff_ptr);
+      coeff2 = vec_vsx_ld(off2, coeff_ptr);
+      coeff0_abs = vec_abs(coeff0);
+      coeff1_abs = vec_abs(coeff1);
+      coeff2_abs = vec_abs(coeff2);
+      zero_mask0 = vec_cmpge(coeff0_abs, zbin);
+      zero_mask1 = vec_cmpge(coeff1_abs, zbin);
+      zero_mask2 = vec_cmpge(coeff2_abs, zbin);
+      qcoeff0 = quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift,
+                               zero_mask0);
+      qcoeff1 = quantize_coeff(coeff1, coeff1_abs, round, quant, quant_shift,
+                               zero_mask1);
+      qcoeff2 = quantize_coeff(coeff2, coeff2_abs, round, quant, quant_shift,
+                               zero_mask2);
+      vec_vsx_st(qcoeff0, off0, qcoeff_ptr);
+      vec_vsx_st(qcoeff1, off1, qcoeff_ptr);
+      vec_vsx_st(qcoeff2, off2, qcoeff_ptr);
+
+      dqcoeff0 = vec_mladd(qcoeff0, dequant, vec_zeros_s16);
+      dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
+      dqcoeff2 = vec_mladd(qcoeff2, dequant, vec_zeros_s16);
+
+      vec_vsx_st(dqcoeff0, off0, dqcoeff_ptr);
+      vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
+      vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
+
+      eob =
+          vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
+      eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
+                     nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+      eob = vec_max(eob, eob2);
+
+      index += 24;
+      off0 += 48;
+      off1 += 48;
+      off2 += 48;
+    } while (index < n_coeffs);
+  }
 
   eob = vec_max_across(eob);
   *eob_ptr = eob[0];
--
2.40.0
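For reference, a minimal scalar sketch (plain C, not from the patch; names and the printed layout are illustrative) of the traversal the revised code performs: two 8-lane vectors (coefficients 0..15 at byte offsets 0 and 16) before the loop, then three vectors (24 coefficients, 48 bytes) per iteration, which divides the remaining coefficients evenly for 8x8 (64) and 16x16 (256) blocks.

  #include <stdio.h>

  /* Illustrative only: the coefficient indices and byte offsets visited by
   * the new traversal. Each int16 coefficient is 2 bytes, so one 8-lane
   * vector spans 16 bytes. */
  int main(void) {
    const int n_coeffs = 256; /* e.g. a 16x16 block */
    int index = 16;           /* coeffs 0..15 handled before the loop */
    int off0 = 32, off1 = 48, off2 = 64;

    printf("prologue: coeffs 0..15 at byte offsets 0 and 16\n");
    while (index < n_coeffs) {
      printf("loop: coeffs %3d..%3d at byte offsets %d, %d, %d\n", index,
             index + 23, off0, off1, off2);
      index += 24;
      off0 += 48;
      off1 += 48;
      off2 += 48;
    }
    return 0;
  }

The 16-coefficient prologue also lets the DC-specific zbin/round/quant/dequant lanes be splatted to their AC values exactly once, so the steady-state loop works on three independent vectors with no per-iteration splats.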