From d468fd90e05ba7f5173d849c63f6a50115c9769b Mon Sep 17 00:00:00 2001 From: Luca Barbato Date: Wed, 6 Jun 2018 21:10:18 +0000 Subject: [PATCH] Implement subtract_block for VSX MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit ~2x speedup or better. [ RUN ] C/VP9SubtractBlockTest.Speed/0 [ BENCH ] 4x4 365.1 ms ( ±2.2 ms ) [ BENCH ] 8x4 258.5 ms ( ±0.3 ms ) [ BENCH ] 4x8 202.7 ms ( ±0.2 ms ) [ BENCH ] 8x8 162.2 ms ( ±0.5 ms ) [ BENCH ] 16x8 138.8 ms ( ±0.3 ms ) [ BENCH ] 8x16 121.5 ms ( ±0.4 ms ) [ BENCH ] 16x16 110.2 ms ( ±0.5 ms ) [ BENCH ] 32x16 104.8 ms ( ±0.1 ms ) [ BENCH ] 16x32 32.7 ms ( ±0.1 ms ) [ BENCH ] 32x32 30.0 ms ( ±0.0 ms ) [ BENCH ] 64x32 28.7 ms ( ±0.0 ms ) [ BENCH ] 32x64 20.1 ms ( ±0.0 ms ) [ BENCH ] 64x64 19.3 ms ( ±0.0 ms ) [ RUN ] VSX/VP9SubtractBlockTest.Speed/0 [ BENCH ] 4x4 155.3 ms ( ±0.9 ms ) [ BENCH ] 8x4 99.3 ms ( ±0.4 ms ) [ BENCH ] 4x8 77.2 ms ( ±0.1 ms ) [ BENCH ] 8x8 45.7 ms ( ±0.0 ms ) [ BENCH ] 16x8 34.1 ms ( ±0.0 ms ) [ BENCH ] 8x16 29.5 ms ( ±0.0 ms ) [ BENCH ] 16x16 19.9 ms ( ±0.0 ms ) [ BENCH ] 32x16 15.1 ms ( ±0.0 ms ) [ BENCH ] 16x32 16.7 ms ( ±0.0 ms ) [ BENCH ] 32x32 14.1 ms ( ±0.0 ms ) [ BENCH ] 64x32 12.6 ms ( ±0.0 ms ) [ BENCH ] 32x64 12.0 ms ( ±0.0 ms ) [ BENCH ] 64x64 11.2 ms ( ±0.0 ms ) Change-Id: I89ce12b6475871dc9e8fde84d0b6fe5c420c28c7 --- test/vp9_subtract_test.cc | 5 ++ vpx_dsp/ppc/subtract_vsx.c | 117 +++++++++++++++++++++++++++++++++++ vpx_dsp/ppc/types_vsx.h | 7 +++ vpx_dsp/ppc/variance_vsx.c | 7 --- vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 2 +- 6 files changed, 131 insertions(+), 8 deletions(-) create mode 100644 vpx_dsp/ppc/subtract_vsx.c diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index a58a67916..88d94f671 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -145,4 +145,9 @@ INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_mmi)); #endif +#if HAVE_VSX 
+INSTANTIATE_TEST_CASE_P(VSX, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_vsx)); +#endif + } // namespace vp9 diff --git a/vpx_dsp/ppc/subtract_vsx.c b/vpx_dsp/ppc/subtract_vsx.c new file mode 100644 index 000000000..3fd4a6a2d --- /dev/null +++ b/vpx_dsp/ppc/subtract_vsx.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static VPX_FORCE_INLINE void subtract_block4x4( + int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { + int16_t *diff1 = diff + 2 * diff_stride; + const uint8_t *src1 = src + 2 * src_stride; + const uint8_t *pred1 = pred + 2 * pred_stride; + + const int16x8_t d0 = vec_vsx_ld(0, diff); + const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride); + const int16x8_t d2 = vec_vsx_ld(0, diff1); + const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride); + + const uint8x16_t s0 = read4x2(src, (int)src_stride); + const uint8x16_t p0 = read4x2(pred, (int)pred_stride); + const uint8x16_t s1 = read4x2(src1, (int)src_stride); + const uint8x16_t p1 = read4x2(pred1, (int)pred_stride); + + const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + + vec_vsx_st(xxpermdi(da, d0, 1), 0, diff); + vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride); + vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1); + vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride); +} + +void vpx_subtract_block_vsx(int rows, int cols, 
int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r = rows, c; + + switch (cols) { + case 64: + case 32: + do { + for (c = 0; c < cols; c += 32) { + const uint8x16_t s0 = vec_vsx_ld(0, src + c); + const uint8x16_t s1 = vec_vsx_ld(16, src + c); + const uint8x16_t p0 = vec_vsx_ld(0, pred + c); + const uint8x16_t p1 = vec_vsx_ld(16, pred + c); + const int16x8_t d0l = + vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = + vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t d1l = + vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1)); + const int16x8_t d1h = + vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + vec_vsx_st(d0h, 0, diff + c); + vec_vsx_st(d0l, 16, diff + c); + vec_vsx_st(d1h, 0, diff + c + 16); + vec_vsx_st(d1l, 16, diff + c + 16); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 16: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + vec_vsx_st(d0l, 16, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 8: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 4: + subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride); + if (r > 4) { + diff += 4 * diff_stride; + pred += 4 * pred_stride; + src += 4 * src_stride; + + subtract_block4x4(diff, diff_stride, + + src, src_stride, + + pred, pred_stride); + } + break; + default: + assert(0); // 
unreachable + } +} diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h index c6c7ce9f1..803d0377a 100644 --- a/vpx_dsp/ppc/types_vsx.h +++ b/vpx_dsp/ppc/types_vsx.h @@ -68,6 +68,13 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, #endif #endif +static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) { + const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); + const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); + + return (uint8x16_t)vec_mergeh(a0, a1); +} + static const uint8x16_t vec_zeros_u8 = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 }; diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c index d3f257b63..50311d1b0 100644 --- a/vpx_dsp/ppc/variance_vsx.c +++ b/vpx_dsp/ppc/variance_vsx.c @@ -14,13 +14,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/ppc/types_vsx.h" -static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) { - const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a); - const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride); - - return (uint8x16_t)vec_mergeh(a0, a1); -} - uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride) { int distortion; diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 573d6fef1..c12dab736 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -342,6 +342,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sad_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/subtract_sse2.asm DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c +DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 9661f3bd8..f237e5503 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -718,7 +718,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") { # Block subtraction # add_proto qw/void vpx_subtract_block/, "int 
rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa mmi sse2/; +specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/; # # Single block SAD -- 2.40.0