From 30d9a1916ca1409497f9bd903fa18d4265621266 Mon Sep 17 00:00:00 2001 From: Scott LaVarnway Date: Mon, 28 Aug 2017 07:26:08 -0700 Subject: [PATCH] vpxdsp: [x86] add highbd_h_predictor functions C vs SSE2 speed gains: _4x4 : ~8.12x _8x8 : ~9.71x _16x16 : ~8.21x _32x32 : ~5.0x BUG=webm:1422 Change-Id: I5e8a1ed4db7b8dc539b3e2a728b0b34d8b4b1993 --- test/test_intra_pred_speed.cc | 22 +-- test/vp9_intrapred_test.cc | 24 ++++ vpx_dsp/vpx_dsp.mk | 1 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 8 +- vpx_dsp/x86/highbd_intrapred_intrin_sse2.c | 159 +++++++++++++++++++++ 5 files changed, 200 insertions(+), 14 deletions(-) create mode 100644 vpx_dsp/x86/highbd_intrapred_intrin_sse2.c diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 23fce335a..b35727f2c 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -482,25 +482,27 @@ HIGHBD_INTRA_PRED_TEST( #if HAVE_SSE2 HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c) + vpx_highbd_v_predictor_4x4_sse2, + vpx_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_tm_predictor_4x4_c) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) + vpx_highbd_v_predictor_8x8_sse2, + vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, - vpx_highbd_tm_predictor_16x16_sse2) + vpx_highbd_v_predictor_16x16_sse2, + vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2) 
HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, - vpx_highbd_tm_predictor_32x32_sse2) + vpx_highbd_v_predictor_32x32_sse2, + vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2) #endif // HAVE_SSE2 #if HAVE_NEON diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index bee0213ea..79ba56199 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -487,6 +487,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -515,6 +523,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 10), 
HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -543,6 +559,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index aaf9e25ff..4bb942db5 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -56,6 +56,7 @@ DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 51ec2e50e..ede556f22 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -195,7 +195,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_4x4 neon/; + specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t 
y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -230,7 +230,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_8x8 neon/; + specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -265,7 +265,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_16x16 neon/; + specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -300,7 +300,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_32x32 neon/; + specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; diff --git a/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c new file mode 100644 index 
000000000..059f900ff
--- /dev/null
+++ b/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c
@@ -0,0 +1,159 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+// -----------------------------------------------------------------------------
+
+void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+  (void)above;
+  (void)bd;
+  _mm_storel_epi64((__m128i *)dst, row0);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row1);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row2);
+  dst += stride;
+  _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+                                     const uint16_t *above,
+                                     const uint16_t *left, int bd) {
+  const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+  const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+  const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+  const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+  const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+  (void)above;
+  (void)bd;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+  dst += stride;
+  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+}
+
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  const __m128i val = _mm_unpacklo_epi64(*row, *row);
+  _mm_store_si128((__m128i *)*dst, val);
+  _mm_store_si128((__m128i *)(*dst + 8), val);
+  *dst += stride;
+}
+
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  const __m128i val = _mm_unpackhi_epi64(*row, *row);
+  _mm_store_si128((__m128i *)(*dst), val);
+  _mm_store_si128((__m128i *)(*dst + 8), val);
+  *dst += stride;
+}
+
+void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 2; i++, left += 8) {
+    const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+    const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+    const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+    const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+    const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+    const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+    const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+    const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+    const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+    h_store_16_unpacklo(&dst, stride, &row0);
+    h_store_16_unpacklo(&dst, stride, &row1);
+    h_store_16_unpacklo(&dst, stride, &row2);
+    h_store_16_unpacklo(&dst, stride, &row3);
+    h_store_16_unpackhi(&dst, stride, &row4);
+    h_store_16_unpackhi(&dst, stride, &row5);
+    h_store_16_unpackhi(&dst, stride, &row6);
+    h_store_16_unpackhi(&dst, stride, &row7);
+  }
+}
+
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  const __m128i val = _mm_unpacklo_epi64(*row, *row);
+  _mm_store_si128((__m128i *)(*dst), val);
+  _mm_store_si128((__m128i *)(*dst + 8), val);
+  _mm_store_si128((__m128i *)(*dst + 16), val);
+  _mm_store_si128((__m128i *)(*dst + 24), val);
+  *dst += stride;
+}
+
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+                                       const __m128i *row) {
+  const __m128i val = _mm_unpackhi_epi64(*row, *row);
+  _mm_store_si128((__m128i *)(*dst), val);
+  _mm_store_si128((__m128i *)(*dst + 8), val);
+  _mm_store_si128((__m128i *)(*dst + 16), val);
+  _mm_store_si128((__m128i *)(*dst + 24), val);
+  *dst += stride;
+}
+
+void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+                                       const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  int i;
+  (void)above;
+  (void)bd;
+
+  for (i = 0; i < 4; i++, left += 8) {
+    const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+    const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+    const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+    const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+    const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+    const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+    const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+    const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+    const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+    h_store_32_unpacklo(&dst, stride, &row0);
+    h_store_32_unpacklo(&dst, stride, &row1);
+    h_store_32_unpacklo(&dst, stride, &row2);
+    h_store_32_unpacklo(&dst, stride, &row3);
+    h_store_32_unpackhi(&dst, stride, &row4);
+    h_store_32_unpackhi(&dst, stride, &row5);
+    h_store_32_unpackhi(&dst, stride, &row6);
+    h_store_32_unpackhi(&dst, stride, &row7);
+  }
+}
-- 
2.40.0