From d468fd90e05ba7f5173d849c63f6a50115c9769b Mon Sep 17 00:00:00 2001
From: Luca Barbato <lu_zero@gentoo.org>
Date: Wed, 6 Jun 2018 21:10:18 +0000
Subject: [PATCH] Implement subtract_block for VSX
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

~2x speedup or better.

[ RUN      ] C/VP9SubtractBlockTest.Speed/0
[    BENCH ]      4x4  365.1 ms ( Â±2.2 ms )
[    BENCH ]      8x4  258.5 ms ( Â±0.3 ms )
[    BENCH ]      4x8  202.7 ms ( Â±0.2 ms )
[    BENCH ]      8x8  162.2 ms ( Â±0.5 ms )
[    BENCH ]     16x8  138.8 ms ( Â±0.3 ms )
[    BENCH ]     8x16  121.5 ms ( Â±0.4 ms )
[    BENCH ]    16x16  110.2 ms ( Â±0.5 ms )
[    BENCH ]    32x16  104.8 ms ( Â±0.1 ms )
[    BENCH ]    16x32  32.7 ms ( Â±0.1 ms )
[    BENCH ]    32x32  30.0 ms ( Â±0.0 ms )
[    BENCH ]    64x32  28.7 ms ( Â±0.0 ms )
[    BENCH ]    32x64  20.1 ms ( Â±0.0 ms )
[    BENCH ]    64x64  19.3 ms ( Â±0.0 ms )

[ RUN      ] VSX/VP9SubtractBlockTest.Speed/0
[    BENCH ]      4x4  155.3 ms ( Â±0.9 ms )
[    BENCH ]      8x4  99.3 ms ( Â±0.4 ms )
[    BENCH ]      4x8  77.2 ms ( Â±0.1 ms )
[    BENCH ]      8x8  45.7 ms ( Â±0.0 ms )
[    BENCH ]     16x8  34.1 ms ( Â±0.0 ms )
[    BENCH ]     8x16  29.5 ms ( Â±0.0 ms )
[    BENCH ]    16x16  19.9 ms ( Â±0.0 ms )
[    BENCH ]    32x16  15.1 ms ( Â±0.0 ms )
[    BENCH ]    16x32  16.7 ms ( Â±0.0 ms )
[    BENCH ]    32x32  14.1 ms ( Â±0.0 ms )
[    BENCH ]    64x32  12.6 ms ( Â±0.0 ms )
[    BENCH ]    32x64  12.0 ms ( Â±0.0 ms )
[    BENCH ]    64x64  11.2 ms ( Â±0.0 ms )

Change-Id: I89ce12b6475871dc9e8fde84d0b6fe5c420c28c7
---
 test/vp9_subtract_test.cc    |   5 ++
 vpx_dsp/ppc/subtract_vsx.c   | 117 +++++++++++++++++++++++++++++++++++
 vpx_dsp/ppc/types_vsx.h      |   7 +++
 vpx_dsp/ppc/variance_vsx.c   |   7 ---
 vpx_dsp/vpx_dsp.mk           |   1 +
 vpx_dsp/vpx_dsp_rtcd_defs.pl |   2 +-
 6 files changed, 131 insertions(+), 8 deletions(-)
 create mode 100644 vpx_dsp/ppc/subtract_vsx.c

diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index a58a67916..88d94f671 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -145,4 +145,9 @@ INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest,
                         ::testing::Values(vpx_subtract_block_mmi));
 #endif
 
+#if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(VSX, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_vsx));
+#endif
+
 }  // namespace vp9
diff --git a/vpx_dsp/ppc/subtract_vsx.c b/vpx_dsp/ppc/subtract_vsx.c
new file mode 100644
index 000000000..3fd4a6a2d
--- /dev/null
+++ b/vpx_dsp/ppc/subtract_vsx.c
@@ -0,0 +1,117 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+static VPX_FORCE_INLINE void subtract_block4x4(
+    int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src,
+    ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) {
+  int16_t *diff1 = diff + 2 * diff_stride;
+  const uint8_t *src1 = src + 2 * src_stride;
+  const uint8_t *pred1 = pred + 2 * pred_stride;
+
+  const int16x8_t d0 = vec_vsx_ld(0, diff);
+  const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride);
+  const int16x8_t d2 = vec_vsx_ld(0, diff1);
+  const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride);
+
+  const uint8x16_t s0 = read4x2(src, (int)src_stride);
+  const uint8x16_t p0 = read4x2(pred, (int)pred_stride);
+  const uint8x16_t s1 = read4x2(src1, (int)src_stride);
+  const uint8x16_t p1 = read4x2(pred1, (int)pred_stride);
+
+  const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+  const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+
+  vec_vsx_st(xxpermdi(da, d0, 1), 0, diff);
+  vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride);
+  vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1);
+  vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride);
+}
+
+void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff,
+                            ptrdiff_t diff_stride, const uint8_t *src,
+                            ptrdiff_t src_stride, const uint8_t *pred,
+                            ptrdiff_t pred_stride) {
+  int r = rows, c;
+
+  switch (cols) {
+    case 64:
+    case 32:
+      do {
+        for (c = 0; c < cols; c += 32) {
+          const uint8x16_t s0 = vec_vsx_ld(0, src + c);
+          const uint8x16_t s1 = vec_vsx_ld(16, src + c);
+          const uint8x16_t p0 = vec_vsx_ld(0, pred + c);
+          const uint8x16_t p1 = vec_vsx_ld(16, pred + c);
+          const int16x8_t d0l =
+              vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+          const int16x8_t d0h =
+              vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+          const int16x8_t d1l =
+              vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1));
+          const int16x8_t d1h =
+              vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
+          vec_vsx_st(d0h, 0, diff + c);
+          vec_vsx_st(d0l, 16, diff + c);
+          vec_vsx_st(d1h, 0, diff + c + 16);
+          vec_vsx_st(d1l, 16, diff + c + 16);
+        }
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 16:
+      do {
+        const uint8x16_t s0 = vec_vsx_ld(0, src);
+        const uint8x16_t p0 = vec_vsx_ld(0, pred);
+        const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
+        const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+        vec_vsx_st(d0h, 0, diff);
+        vec_vsx_st(d0l, 16, diff);
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 8:
+      do {
+        const uint8x16_t s0 = vec_vsx_ld(0, src);
+        const uint8x16_t p0 = vec_vsx_ld(0, pred);
+        const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
+        vec_vsx_st(d0h, 0, diff);
+        diff += diff_stride;
+        pred += pred_stride;
+        src += src_stride;
+      } while (--r);
+      break;
+    case 4:
+      subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride);
+      if (r > 4) {
+        diff += 4 * diff_stride;
+        pred += 4 * pred_stride;
+        src += 4 * src_stride;
+
+        subtract_block4x4(diff, diff_stride,
+
+                          src, src_stride,
+
+                          pred, pred_stride);
+      }
+      break;
+    default:
+      assert(0);  // unreachable
+  }
+}
diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h
index c6c7ce9f1..803d0377a 100644
--- a/vpx_dsp/ppc/types_vsx.h
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -68,6 +68,13 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
 #endif
 #endif
 
+static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
+  const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
+  const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
+
+  return (uint8x16_t)vec_mergeh(a0, a1);
+}
+
 static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0,
                                          0, 0, 0, 0, 0, 0, 0, 0 };
 static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 };
diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c
index d3f257b63..50311d1b0 100644
--- a/vpx_dsp/ppc/variance_vsx.c
+++ b/vpx_dsp/ppc/variance_vsx.c
@@ -14,13 +14,6 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/ppc/types_vsx.h"
 
-static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
-  const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
-  const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
-
-  return (uint8x16_t)vec_mergeh(a0, a1);
-}
-
 uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b,
                               int b_stride) {
   int distortion;
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 573d6fef1..c12dab736 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -342,6 +342,7 @@ DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
 
 DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
+DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 9661f3bd8..f237e5503 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -718,7 +718,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/;
 
 #
 # Single block SAD
-- 
2.49.0