From 57c4711b5c87c3d2d25d37177ec6f8b8119e3ac4 Mon Sep 17 00:00:00 2001 From: Yi Luo Date: Tue, 2 Aug 2016 09:57:08 -0700 Subject: [PATCH] Optimization EXT_INTRA's filtered intra predictor (SSE4.1) - Add unit tests to verify the bit-exact result. - In speed test, function speed (for each mode/tx_size) improves about 23%~35%. - On E5-2680, park_joy_1080p, 10 frames, --kf-max-dist=1, encoding time improves about 1%~2%. Change-Id: Id89f313d44eea562c02e775a6253dc4df7e046a9 --- test/reconintra_predictors_test.cc | 186 +++++++++ test/test.mk | 4 + vp10/common/intra_filters.h | 67 ++++ vp10/common/reconintra.c | 128 +++---- vp10/common/vp10_rtcd_defs.pl | 24 ++ vp10/common/x86/reconintra_sse4.c | 593 +++++++++++++++++++++++++++++ vp10/vp10_common.mk | 5 + 7 files changed, 934 insertions(+), 73 deletions(-) create mode 100644 test/reconintra_predictors_test.cc create mode 100644 vp10/common/intra_filters.h create mode 100644 vp10/common/x86/reconintra_sse4.c diff --git a/test/reconintra_predictors_test.cc b/test/reconintra_predictors_test.cc new file mode 100644 index 000000000..38720baa8 --- /dev/null +++ b/test/reconintra_predictors_test.cc @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp10_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp10/common/enums.h" + +namespace { + +using std::tr1::tuple; +using libvpx_test::ACMRandom; + +typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left); + +// Note: +// Test parameter list: +// Reference predictor, optimized predictor, prediction mode, block size +// +typedef tuple PredFuncMode; +typedef tuple PredParams; + +const int MaxBlkSize = 32; + +// By default, disable speed test +#define PREDICTORS_SPEED_TEST (0) + +#if PREDICTORS_SPEED_TEST +const int MaxTestNum = 100000; +#else +const int MaxTestNum = 100; +#endif + +class VP10IntraPredOptimzTest : public ::testing::TestWithParam { + public: + virtual ~VP10IntraPredOptimzTest() {} + virtual void SetUp() { + PredFuncMode funcMode = GET_PARAM(0); + predFuncRef_ = std::tr1::get<0>(funcMode); + predFunc_ = std::tr1::get<1>(funcMode); + mode_ = std::tr1::get<2>(funcMode); + blockSize_ = GET_PARAM(1); + + alloc_ = (uint8_t *)malloc((3 * MaxBlkSize + 2) * sizeof(alloc_[0])); + predRef_ = + (uint8_t *)malloc(MaxBlkSize * MaxBlkSize * sizeof(predRef_[0])); + pred_ = (uint8_t *)malloc(MaxBlkSize * MaxBlkSize * sizeof(pred_[0])); + } + + virtual void TearDown() { + delete[] alloc_; + delete[] predRef_; + delete[] pred_; + libvpx_test::ClearSystemState(); + } + + protected: + void RunTest() const { + int tstIndex = 0; + int stride = blockSize_; + uint8_t *left = alloc_; + uint8_t *above = alloc_ + MaxBlkSize + 1; + while (tstIndex < MaxTestNum) { + PrepareBuffer(); + predFuncRef_(predRef_, stride, blockSize_, &above[1], left); + ASM_REGISTER_STATE_CHECK( + predFunc_(pred_, stride, blockSize_, &above[1], left)); + DiffPred(tstIndex); + tstIndex += 1; + } + } + + void RunSpeedTestC() const { + int tstIndex = 0; + int stride = blockSize_; + uint8_t *left = alloc_; + uint8_t *above = alloc_ + MaxBlkSize + 1; + PrepareBuffer(); + while (tstIndex < MaxTestNum) { + predFuncRef_(predRef_, stride, blockSize_, &above[1], left); + tstIndex += 1; + } + } + + void RunSpeedTestSSE() const { + int tstIndex = 0; + int stride = blockSize_; + uint8_t *left = alloc_; + uint8_t *above = alloc_ + MaxBlkSize + 1; + PrepareBuffer(); + while (tstIndex < MaxTestNum) { + predFunc_(predRef_, stride, blockSize_, &above[1], left); + tstIndex += 1; + } + } + + private: + void PrepareBuffer() const { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + int i = 0; + while (i < (3 * MaxBlkSize + 2)) { + alloc_[i] = rnd.Rand8(); + i += 1; + } + } + + void DiffPred(int testNum) const { + int i = 0; + while (i < blockSize_ * blockSize_) { + EXPECT_EQ(predRef_[i], pred_[i]) + << "Error at position: " << i << " " + << "Block size: " << blockSize_ << " " + << "Test number: " << testNum; + i += 1; + } + } + + Predictor predFunc_; + Predictor predFuncRef_; + int mode_; + int blockSize_; + uint8_t *alloc_; + uint8_t *pred_; + uint8_t *predRef_; +}; + +TEST_P(VP10IntraPredOptimzTest, BitExactCheck) { + RunTest(); +} + +#if PREDICTORS_SPEED_TEST +TEST_P(VP10IntraPredOptimzTest, SpeedCheckC) { + RunSpeedTestC(); +} + +TEST_P(VP10IntraPredOptimzTest, SpeedCheckSSE) { + RunSpeedTestSSE(); +} +#endif + +using std::tr1::make_tuple; + +const PredFuncMode kPredFuncMdArray[] = { + make_tuple(vp10_dc_filter_predictor_c, vp10_dc_filter_predictor_sse4_1, + DC_PRED), + make_tuple(vp10_v_filter_predictor_c, vp10_v_filter_predictor_sse4_1, + V_PRED), + make_tuple(vp10_h_filter_predictor_c, vp10_h_filter_predictor_sse4_1, + H_PRED), + make_tuple(vp10_d45_filter_predictor_c, vp10_d45_filter_predictor_sse4_1, + D45_PRED), + make_tuple(vp10_d135_filter_predictor_c, vp10_d135_filter_predictor_sse4_1, + D135_PRED), + make_tuple(vp10_d117_filter_predictor_c, vp10_d117_filter_predictor_sse4_1, + D117_PRED), + make_tuple(vp10_d153_filter_predictor_c, vp10_d153_filter_predictor_sse4_1, + D153_PRED), + make_tuple(vp10_d207_filter_predictor_c, vp10_d207_filter_predictor_sse4_1, + D207_PRED), + make_tuple(vp10_d63_filter_predictor_c, vp10_d63_filter_predictor_sse4_1, + D63_PRED), + make_tuple(vp10_tm_filter_predictor_c, vp10_tm_filter_predictor_sse4_1, + TM_PRED), +}; + +const int kBlkSize[] = {4, 8, 16, 32}; + +INSTANTIATE_TEST_CASE_P( + SSE4_1, VP10IntraPredOptimzTest, + ::testing::Combine( + ::testing::ValuesIn(kPredFuncMdArray), + ::testing::ValuesIn(kBlkSize))); + +} // namespace diff --git a/test/test.mk b/test/test.mk index 4b4752f01..346a9babd 100644 --- a/test/test.mk +++ b/test/test.mk @@ -149,6 +149,10 @@ LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc endif +ifeq ($(CONFIG_EXT_INTRA),yes) +LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += reconintra_predictors_test.cc +endif + ifeq ($(CONFIG_OBMC),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_sad_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_variance_test.cc diff --git a/vp10/common/intra_filters.h b/vp10/common/intra_filters.h new file mode 100644 index 000000000..664a7d6d1 --- /dev/null +++ b/vp10/common/intra_filters.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP10_COMMON_INTRA_FILTERS_H_ +#define VP10_COMMON_INTRA_FILTERS_H_ + +#define FILTER_INTRA_PREC_BITS (10) + +static int filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = { + { + {735, 881, -537, -54}, + {1005, 519, -488, -11}, + {383, 990, -343, -6}, + {442, 805, -542, 319}, + {658, 616, -133, -116}, + {875, 442, -141, -151}, + {386, 741, -23, -80}, + {390, 1027, -446, 51}, + {679, 606, -523, 262}, + {903, 922, -778, -23}, + }, + { + {648, 803, -444, 16}, + {972, 620, -576, 7}, + {561, 967, -499, -5}, + {585, 762, -468, 144}, + {596, 619, -182, -9}, + {895, 459, -176, -153}, + {557, 722, -126, -129}, + {601, 839, -523, 105}, + {562, 709, -499, 251}, + {803, 872, -695, 43}, + }, + { + {423, 728, -347, 111}, + {963, 685, -665, 23}, + {281, 1024, -480, 216}, + {640, 596, -437, 78}, + {429, 669, -259, 99}, + {740, 646, -415, 23}, + {568, 771, -346, 40}, + {404, 833, -486, 209}, + {398, 712, -423, 307}, + {939, 935, -887, 17}, + }, + { + {477, 737, -393, 150}, + {881, 630, -546, 67}, + {506, 984, -443, -20}, + {114, 459, -270, 528}, + {433, 528, 14, 3}, + {837, 470, -301, -30}, + {181, 777, 89, -107}, + {-29, 716, -232, 259}, + {589, 646, -495, 255}, + {740, 884, -728, 77}, + }, +}; + +#endif // VP10_COMMON_INTRA_FILTERS_H_ diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c index cdcca4a41..19d0c3d48 100644 --- a/vp10/common/reconintra.c +++ b/vp10/common/reconintra.c @@ -10,6 +10,7 @@ #include +#include "./vp10_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_ports/system_state.h" @@ -20,7 +21,9 @@ #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/vpx_once.h" - +#if CONFIG_EXT_INTRA +#include "vp10/common/intra_filters.h" +#endif #include "vp10/common/reconintra.h" #include "vp10/common/onyxc_int.h" @@ -390,7 +393,6 @@ static void vp10_init_intra_predictors_internal(void) { } #if CONFIG_EXT_INTRA -#define FILTER_INTRA_PREC_BITS 10 static const uint8_t ext_intra_extend_modes[FILTER_INTRA_MODES] = { NEED_LEFT | NEED_ABOVE, // FILTER_DC @@ -719,57 +721,6 @@ static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, } } -static int filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = { - { - {735, 881, -537, -54}, - {1005, 519, -488, -11}, - {383, 990, -343, -6}, - {442, 805, -542, 319}, - {658, 616, -133, -116}, - {875, 442, -141, -151}, - {386, 741, -23, -80}, - {390, 1027, -446, 51}, - {679, 606, -523, 262}, - {903, 922, -778, -23}, - }, - { - {648, 803, -444, 16}, - {972, 620, -576, 7}, - {561, 967, -499, -5}, - {585, 762, -468, 144}, - {596, 619, -182, -9}, - {895, 459, -176, -153}, - {557, 722, -126, -129}, - {601, 839, -523, 105}, - {562, 709, -499, 251}, - {803, 872, -695, 43}, - }, - { - {423, 728, -347, 111}, - {963, 685, -665, 23}, - {281, 1024, -480, 216}, - {640, 596, -437, 78}, - {429, 669, -259, 99}, - {740, 646, -415, 23}, - {568, 771, -346, 40}, - {404, 833, -486, 209}, - {398, 712, -423, 307}, - {939, 935, -887, 17}, - }, - { - {477, 737, -393, 150}, - {881, 630, -546, 67}, - {506, 984, -443, -20}, - {114, 459, -270, 528}, - {433, 528, 14, 3}, - {837, 470, -301, -30}, - {181, 777, 89, -107}, - {-29, 716, -232, 259}, - {589, 646, -495, 255}, - {740, 884, -728, 77}, - }, -}; - static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left, @@ -815,63 +766,94 @@ static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs, } } -static void dc_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { +void vp10_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED); } -static void v_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED); } -static void h_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED); } -static void d45_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_d45_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED); } -static void d135_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_d135_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED); } -static void d117_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_d117_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED); } -static void d153_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED); } -static void d207_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_d207_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED); } -static void d63_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_d63_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED); } -static void tm_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs, +void vp10_tm_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED); } -static void (*filter_intra_predictors[EXT_INTRA_MODES])(uint8_t *dst, - ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) = { - dc_filter_predictor, v_filter_predictor, h_filter_predictor, - d45_filter_predictor, d135_filter_predictor, d117_filter_predictor, - d153_filter_predictor, d207_filter_predictor, d63_filter_predictor, - tm_filter_predictor, -}; +static void filter_intra_predictors(int mode, uint8_t *dst, + ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + switch (mode) { + case DC_PRED: + vp10_dc_filter_predictor(dst, stride, bs, above, left); + break; + case V_PRED: + vp10_v_filter_predictor(dst, stride, bs, above, left); + break; + case H_PRED: + vp10_h_filter_predictor(dst, stride, bs, above, left); + break; + case D45_PRED: + vp10_d45_filter_predictor(dst, stride, bs, above, left); + break; + case D135_PRED: + vp10_d135_filter_predictor(dst, stride, bs, above, left); + break; + case D117_PRED: + vp10_d117_filter_predictor(dst, stride, bs, above, left); + break; + case D153_PRED: + vp10_d153_filter_predictor(dst, stride, bs, above, left); + break; + case D207_PRED: + vp10_d207_filter_predictor(dst, stride, bs, above, left); + break; + case D63_PRED: + vp10_d63_filter_predictor(dst, stride, bs, above, left); + break; + case TM_PRED: + vp10_tm_filter_predictor(dst, stride, bs, above, left); + break; + default: + assert(0); + } +} #if CONFIG_VP9_HIGHBITDEPTH static int highbd_intra_subpel_interp(int base, int shift, const uint16_t *ref, @@ -1491,8 +1473,8 @@ static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, #if CONFIG_EXT_INTRA if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) { - filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs, - const_above_row, left_col); + filter_intra_predictors(ext_intra_mode, dst, dst_stride, bs, + const_above_row, left_col); return; } diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl index 0ca48a317..6dbcc65c9 100644 --- a/vp10/common/vp10_rtcd_defs.pl +++ b/vp10/common/vp10_rtcd_defs.pl @@ -298,6 +298,30 @@ if (vpx_config("CONFIG_NEW_QUANT") eq "yes") { specialize qw/quantize_32x32_fp_nuq/; } +# EXT_INTRA predictor functions +if (vpx_config("CONFIG_EXT_INTRA") eq "yes") { + add_proto qw/void vp10_dc_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_dc_filter_predictor sse4_1/; + add_proto qw/void vp10_v_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_v_filter_predictor sse4_1/; + add_proto qw/void vp10_h_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_h_filter_predictor sse4_1/; + add_proto qw/void vp10_d45_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_d45_filter_predictor sse4_1/; + add_proto qw/void vp10_d135_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_d135_filter_predictor sse4_1/; + add_proto qw/void vp10_d117_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_d117_filter_predictor sse4_1/; + add_proto qw/void vp10_d153_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_d153_filter_predictor sse4_1/; + add_proto qw/void vp10_d207_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_d207_filter_predictor sse4_1/; + add_proto qw/void vp10_d63_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_d63_filter_predictor sse4_1/; + add_proto qw/void vp10_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left"; + specialize qw/vp10_tm_filter_predictor sse4_1/; +} + # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # diff --git a/vp10/common/x86/reconintra_sse4.c b/vp10/common/x86/reconintra_sse4.c new file mode 100644 index 000000000..851d850e7 --- /dev/null +++ b/vp10/common/x86/reconintra_sse4.c @@ -0,0 +1,593 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include + +#include "./vp10_rtcd.h" +#include "vpx_ports/mem.h" +#include "vp10/common/enums.h" +#include "vp10/common/intra_filters.h" + +static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left, + __m128i *sum) { + const __m128i a = _mm_loadu_si128((const __m128i *)above); + const __m128i l = _mm_loadu_si128((const __m128i *)left); + const __m128i zero = _mm_setzero_si128(); + + __m128i u0 = _mm_unpacklo_epi8(a, zero); + __m128i u1 = _mm_unpacklo_epi8(l, zero); + + sum[0] = _mm_add_epi16(u0, u1); +} + +static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left, + __m128i *params) { + const __m128i zero = _mm_setzero_si128(); + __m128i sum_vector, u; + uint16_t sum_value; + + AddPixelsSmall(above, left, &sum_vector); + + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values + u = _mm_srli_si128(sum_vector, 2); + sum_vector = _mm_add_epi16(sum_vector, u); + + sum_value = _mm_extract_epi16(sum_vector, 0); + sum_value += 4; + sum_value >>= 3; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left, + __m128i *params) { + const __m128i zero = _mm_setzero_si128(); + __m128i sum_vector, u; + uint16_t sum_value; + + AddPixelsSmall(above, left, &sum_vector); + + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values + + u = _mm_srli_si128(sum_vector, 2); + sum_vector = _mm_add_epi16(sum_vector, u); + + sum_value = _mm_extract_epi16(sum_vector, 0); + sum_value += 8; + sum_value >>= 4; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left, + __m128i *sum) { + const __m128i a = _mm_loadu_si128((const __m128i *)above); + const __m128i l = _mm_loadu_si128((const __m128i *)left); + const __m128i zero = _mm_setzero_si128(); + + __m128i u0 = _mm_unpacklo_epi8(a, zero); + __m128i u1 = _mm_unpacklo_epi8(l, zero); + + sum[0] = _mm_add_epi16(u0, u1); + + u0 = _mm_unpackhi_epi8(a, zero); + u1 = _mm_unpackhi_epi8(l, zero); + + sum[0] = _mm_add_epi16(sum[0], u0); + sum[0] = _mm_add_epi16(sum[0], u1); +} + +static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left, + __m128i *params) { + const __m128i zero = _mm_setzero_si128(); + __m128i sum_vector, u; + uint16_t sum_value; + + AddPixelsLarge(above, left, &sum_vector); + + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values + sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values + + u = _mm_srli_si128(sum_vector, 2); + sum_vector = _mm_add_epi16(sum_vector, u); + + sum_value = _mm_extract_epi16(sum_vector, 0); + sum_value += 16; + sum_value >>= 5; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left, + __m128i *params) { + const __m128i zero = _mm_setzero_si128(); + __m128i sum_vector[2], u; + uint16_t sum_value; + + AddPixelsLarge(above, left, &sum_vector[0]); + AddPixelsLarge(above + 16, left + 16, &sum_vector[1]); + + sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]); + sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values + sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values + + u = _mm_srli_si128(sum_vector[0], 2); + sum_vector[0] = _mm_add_epi16(sum_vector[0], u); + + sum_value = _mm_extract_epi16(sum_vector[0], 0); + sum_value += 32; + sum_value >>= 6; + *params = _mm_set1_epi32(sum_value); + return sum_value; +} + +// Note: +// params[4] : mean value, 4 int32_t repetition +// +static INLINE int CalcRefPixelsMeanValue(const uint8_t *above, + const uint8_t *left, + int bs, __m128i *params) { + int meanValue = 0; + switch (bs) { + case 4: + meanValue = GetMeanValue4x4(above, left, params); + break; + case 8: + meanValue = GetMeanValue8x8(above, left, params); + break; + case 16: + meanValue = GetMeanValue16x16(above, left, params); + break; + case 32: + meanValue = GetMeanValue32x32(above, left, params); + break; + default: + assert(0); + } + return meanValue; +} + +// Note: +// params[0-3] : 4-tap filter coefficients (int32_t per coefficient) +// +static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) { + const TX_SIZE tx_size = (bs == 32) ? TX_32X32 : + ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4))); + // c0 + params[0] = _mm_set_epi32(filter_intra_taps_4[tx_size][mode][0], + filter_intra_taps_4[tx_size][mode][0], + filter_intra_taps_4[tx_size][mode][0], + filter_intra_taps_4[tx_size][mode][0]); + // c1 + params[1] = _mm_set_epi32(filter_intra_taps_4[tx_size][mode][1], + filter_intra_taps_4[tx_size][mode][1], + filter_intra_taps_4[tx_size][mode][1], + filter_intra_taps_4[tx_size][mode][1]); + // c2 + params[2] = _mm_set_epi32(filter_intra_taps_4[tx_size][mode][2], + filter_intra_taps_4[tx_size][mode][2], + filter_intra_taps_4[tx_size][mode][2], + filter_intra_taps_4[tx_size][mode][2]); + // c3 + params[3] = _mm_set_epi32(filter_intra_taps_4[tx_size][mode][3], + filter_intra_taps_4[tx_size][mode][3], + filter_intra_taps_4[tx_size][mode][3], + filter_intra_taps_4[tx_size][mode][3]); +} + +static const int maxBlkSize = 32; + +static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst, + ptrdiff_t stride) { + const int predStride = (maxBlkSize << 1) + 1; + __m128i p0 = _mm_loadu_si128((const __m128i *)pred); + __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride)); + __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride)); + __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride)); + + p0 = _mm_add_epi32(p0, mean[0]); + p1 = _mm_add_epi32(p1, mean[0]); + p2 = _mm_add_epi32(p2, mean[0]); + p3 = _mm_add_epi32(p3, mean[0]); + + p0 = _mm_packus_epi32(p0, p1); + p1 = _mm_packus_epi32(p2, p3); + p0 = _mm_packus_epi16(p0, p1); + + *((int *)dst) = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0); + p0 = _mm_srli_si128(p0, 4); + *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0); +} + +static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst, + ptrdiff_t stride) { + const int predStride = (maxBlkSize << 1) + 1; + __m128i p0, p1, p2, p3; + int r = 0; + + while (r < 8) { + p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); + p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); + r += 1; + p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); + p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); + + p0 = _mm_add_epi32(p0, mean[0]); + p1 = _mm_add_epi32(p1, mean[0]); + p2 = _mm_add_epi32(p2, mean[0]); + p3 = _mm_add_epi32(p3, mean[0]); + + p0 = _mm_packus_epi32(p0, p1); + p1 = _mm_packus_epi32(p2, p3); + p0 = _mm_packus_epi16(p0, p1); + + _mm_storel_epi64((__m128i *)dst, p0); + dst += stride; + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)dst, p0); + dst += stride; + r += 1; + } +} + +static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst, + ptrdiff_t stride) { + const int predStride = (maxBlkSize << 1) + 1; + __m128i p0, p1, p2, p3; + int r = 0; + + while (r < 16) { + p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); + p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); + p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8)); + p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12)); + + p0 = _mm_add_epi32(p0, mean[0]); + p1 = _mm_add_epi32(p1, mean[0]); + p2 = _mm_add_epi32(p2, mean[0]); + p3 = _mm_add_epi32(p3, mean[0]); + + p0 = _mm_packus_epi32(p0, p1); + p1 = _mm_packus_epi32(p2, p3); + p0 = _mm_packus_epi16(p0, p1); + + _mm_storel_epi64((__m128i *)dst, p0); + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)(dst + 8), p0); + dst += stride; + r += 1; + } +} + +static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst, + ptrdiff_t stride) { + const int predStride = (maxBlkSize << 1) + 1; + __m128i p0, p1, p2, p3, p4, p5, p6, p7; + int r = 0; + + while (r < 32) { + p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride)); + p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4)); + p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8)); + p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12)); + + p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16)); + p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20)); + p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24)); + p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28)); + + p0 = _mm_add_epi32(p0, mean[0]); + p1 = _mm_add_epi32(p1, mean[0]); + p2 = _mm_add_epi32(p2, mean[0]); + p3 = _mm_add_epi32(p3, mean[0]); + + p4 = _mm_add_epi32(p4, mean[0]); + p5 = _mm_add_epi32(p5, mean[0]); + p6 = _mm_add_epi32(p6, mean[0]); + p7 = _mm_add_epi32(p7, mean[0]); + + p0 = _mm_packus_epi32(p0, p1); + p1 = _mm_packus_epi32(p2, p3); + p0 = _mm_packus_epi16(p0, p1); + + p4 = _mm_packus_epi32(p4, p5); + p5 = _mm_packus_epi32(p6, p7); + p4 = _mm_packus_epi16(p4, p5); + + _mm_storel_epi64((__m128i *)dst, p0); + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)(dst + 8), p0); + + _mm_storel_epi64((__m128i *)(dst + 16), p4); + p4 = _mm_srli_si128(p4, 8); + _mm_storel_epi64((__m128i *)(dst + 24), p4); + + dst += stride; + r += 1; + } +} + +static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst, + ptrdiff_t stride) { + switch (bs) { + case 4: + SavePred4x4(pred, mean, dst, stride); + break; + case 8: + SavePred8x8(pred, mean, dst, stride); + break; + case 16: + SavePred16x16(pred, mean, dst, stride); + break; + case 32: + SavePred32x32(pred, mean, dst, stride); + break; + default: + assert(0); + } +} + +typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred, + const int predStride); + +static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred, + const int predStride) { + __m128i u0, u1, u2; + int c0 = _mm_extract_epi32(prm[1], 0); + int x = *(pred + predStride); + int sum; + + u0 = _mm_mullo_epi32(p[0], prm[2]); + u1 = _mm_mullo_epi32(p[1], prm[0]); + u2 = _mm_mullo_epi32(p[2], prm[3]); + + u0 = _mm_add_epi32(u0, u1); + u0 = _mm_add_epi32(u0, u2); + + sum = _mm_extract_epi32(u0, 0); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 1) = x; + + sum = _mm_extract_epi32(u0, 1); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 2) = x; + + sum = _mm_extract_epi32(u0, 2); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 3) = x; + + sum = _mm_extract_epi32(u0, 3); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 4) = x; +} + +static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred, + const int predStride) { + __m128i u0, u1, u2; + int c0 = _mm_extract_epi32(prm[1], 0); + int x = *(pred + predStride); + int sum; + + u0 = _mm_mullo_epi32(p[0], prm[2]); + u1 = _mm_mullo_epi32(p[1], prm[0]); + u2 = _mm_mullo_epi32(p[2], prm[3]); + + u0 = _mm_add_epi32(u0, u1); + u0 = _mm_add_epi32(u0, u2); + + sum = _mm_extract_epi32(u0, 0); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 1) = x; + + sum = _mm_extract_epi32(u0, 1); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 2) = x; + + sum = _mm_extract_epi32(u0, 2); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 3) = x; +} + +static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred, + const int predStride) { + __m128i u0, u1, u2; + int c0 = _mm_extract_epi32(prm[1], 0); + int x = *(pred + predStride); + int sum; + + u0 = _mm_mullo_epi32(p[0], prm[2]); + u1 = _mm_mullo_epi32(p[1], prm[0]); + u2 = _mm_mullo_epi32(p[2], prm[3]); + + u0 = _mm_add_epi32(u0, u1); + u0 = _mm_add_epi32(u0, u2); + + sum = _mm_extract_epi32(u0, 0); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 1) = x; + + sum = _mm_extract_epi32(u0, 1); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 2) = x; +} + +static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred, + const int predStride) { + __m128i u0, u1, u2; + int c0 = _mm_extract_epi32(prm[1], 0); + int x = *(pred + predStride); + int sum; + + u0 = _mm_mullo_epi32(p[0], prm[2]); + u1 = _mm_mullo_epi32(p[1], prm[0]); + u2 = _mm_mullo_epi32(p[2], prm[3]); + + u0 = _mm_add_epi32(u0, u1); + u0 = _mm_add_epi32(u0, u2); + + sum = _mm_extract_epi32(u0, 0); + sum += c0 * x; + x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS); + *(pred + predStride + 1) = x; +} + +static ProducePixelsFunc prodPixelsFuncTab[4] = { + ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels}; + +static void ProducePixels(int *pred, const __m128i *prm, int remain) { + __m128i p[3]; + const int predStride = (maxBlkSize << 1) + 1; + int index; + + p[0] = _mm_loadu_si128((const __m128i *)pred); + p[1] = _mm_loadu_si128((const __m128i *)(pred + 1)); + p[2] = _mm_loadu_si128((const __m128i *)(pred + 2)); + + if (remain <= 2) { + return; + } + if (remain > 5) { + index = 3; + } else { + index = remain - 3; + } + prodPixelsFuncTab[index](p, prm, pred, predStride); +} + +// Note: +// At column index c, the remaining pixels are R = 2 * bs + 1 - r - c +// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1 +static void GeneratePrediction(const uint8_t *above, const uint8_t *left, + const int bs, const __m128i *prm, int meanValue, + uint8_t *dst, ptrdiff_t stride) { + int pred[33][65]; + int r, c, colBound; + int remainings; + + for (r = 0; r < bs; ++r) { + pred[r + 1][0] = (int)left[r] - meanValue; + } + + above -= 1; + for (c = 0; c < 2 * bs + 1; ++c) { + pred[0][c] = (int)above[c] - meanValue; + } + + r = 0; + c = 0; + while (r < bs) { + colBound = (bs << 1) - r; + for (c = 0; c < colBound; c += 4) { + remainings = colBound - c + 1; + ProducePixels(&pred[r][c], prm, remainings); + } + r += 1; + } + + SavePrediction(&pred[1][1], &prm[4], bs, dst, stride); +} + +static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs, + __m128i *prm, uint8_t *dst, ptrdiff_t stride) { + int meanValue = 0; + meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]); + GeneratePrediction(above, left, bs, prm, meanValue, dst, stride); +} + +void vp10_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, DC_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, V_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, H_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D45_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D135_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D117_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D153_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D207_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, D63_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} + +void vp10_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs, + const uint8_t *above, + const uint8_t *left) { + __m128i prm[5]; + GetIntraFilterParams(bs, TM_PRED, &prm[0]); + FilterPrediction(above, left, bs, prm, dst, stride); +} diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk index 44e0635c2..295ec966b 100644 --- a/vp10/vp10_common.mk +++ b/vp10/vp10_common.mk @@ -118,6 +118,11 @@ VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c endif +ifeq ($(CONFIG_EXT_INTRA),yes) +VP10_COMMON_SRCS-yes += common/intra_filters.h +VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/reconintra_sse4.c +endif + VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.c VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.h -- 2.50.1