From e357b9efe08eca4c878e2a43dcde4bd4f7fb39a7 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Fri, 1 Jul 2016 16:02:41 -0700 Subject: [PATCH] Support measure distortion in the pixel domain Use pixel domain distortion metric in speed 0. This improves the compression performance by 0.3% for both low and high resolution test sets. Change-Id: I5b5b7115960de73f0b5e5d0c69db305e490e6f1d --- test/sum_squares_test.cc | 115 ++++++++++++++++++ test/test.mk | 1 + vp9/encoder/vp9_rdopt.c | 201 ++++++++++++++++++++++++------- vp9/encoder/vp9_speed_features.c | 3 + vp9/encoder/vp9_speed_features.h | 5 + vpx_dsp/sum_squares.c | 27 +++++ vpx_dsp/vpx_dsp.mk | 2 + vpx_dsp/vpx_dsp_rtcd_defs.pl | 3 + vpx_dsp/x86/sum_squares_sse2.c | 128 ++++++++++++++++++++ 9 files changed, 439 insertions(+), 46 deletions(-) create mode 100644 test/sum_squares_test.cc create mode 100644 vpx_dsp/sum_squares.c create mode 100644 vpx_dsp/x86/sum_squares_sse2.c diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc new file mode 100644 index 000000000..0a7734759 --- /dev/null +++ b/test/sum_squares_test.cc @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx_ports/mem.h" + +using libvpx_test::ACMRandom; + +namespace { +const int kNumIterations = 10000; + +typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size); +typedef std::tr1::tuple SumSquaresParam; + +class SumSquaresTest : public ::testing::TestWithParam { + public: + virtual ~SumSquaresTest() {} + virtual void SetUp() { + ref_func_ = GET_PARAM(0); + tst_func_ = GET_PARAM(1); + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + SSI16Func ref_func_; + SSI16Func tst_func_; +}; + +TEST_P(SumSquaresTest, OperationCheck) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, int16_t, src[256 * 256]); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + + for (int k = 0; k < kNumIterations; k++) { + const int size = 4 << rnd(6); // Up to 128x128 + int stride = 4 << rnd(7); // Up to 256 stride + while (stride < size) { // Make sure it's valid + stride = 4 << rnd(7); + } + + for (int i = 0; i < size; ++i) { + for (int j = 0; j < size; ++j) { + src[i * stride + j] = rnd(2) ? rnd(limit) : -rnd(limit); + } + } + + const uint64_t res_ref = ref_func_(src, stride, size); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size)); + + ASSERT_EQ(res_ref, res_tst) + << "Error: Sum Squares Test" + << " C output does not match optimized output."; + } +} + +TEST_P(SumSquaresTest, ExtremeValues) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + DECLARE_ALIGNED(16, int16_t, src[256 * 256]); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + + for (int k = 0; k < kNumIterations; k++) { + const int size = 4 << rnd(6); // Up to 128x128 + int stride = 4 << rnd(7); // Up to 256 stride + while (stride < size) { // Make sure it's valid + stride = 4 << rnd(7); + } + + const int val = rnd(2) ? limit - 1 : -(limit - 1); + for (int i = 0; i < size; ++i) { + for (int j = 0; j < size; ++j) { + src[i * stride + j] = val; + } + } + + const uint64_t res_ref = ref_func_(src, stride, size); + uint64_t res_tst; + ASM_REGISTER_STATE_CHECK(res_tst = tst_func_(src, stride, size)); + + ASSERT_EQ(res_ref, res_tst) + << "Error: Sum Squares Test" + << " C output does not match optimized output."; + } +} + +using std::tr1::make_tuple; + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_sse2))); +#endif // HAVE_SSE2 +} // namespace diff --git a/test/test.mk b/test/test.mk index 2d50ce813..81381e9f3 100644 --- a/test/test.mk +++ b/test/test.mk @@ -170,6 +170,7 @@ endif # VP9 ## Multi-codec / unconditional whitebox tests. LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sum_squares_test.cc TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 28530386c..a2426b17d 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -64,6 +64,7 @@ typedef struct { } REF_DEFINITION; struct rdcost_block_args { + const VP9_COMP *cpi; MACROBLOCK *x; ENTROPY_CONTEXT t_above[16]; ENTROPY_CONTEXT t_left[16]; @@ -463,38 +464,123 @@ static int cost_coeffs(MACROBLOCK *x, return cost; } -static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, +static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, int block, + int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, int64_t *out_sse) { - const int ss_txfrm_size = tx_size << 1; MACROBLOCKD* const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - int64_t this_sse; - int shift = tx_size == TX_32X32 ? 0 : 2; - tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); - tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + + if (cpi->sf.txfm_domain_distortion) { + const int ss_txfrm_size = tx_size << 1; + int64_t this_sse; + const int shift = tx_size == TX_32X32 ? 0 : 2; + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); + const tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); #if CONFIG_VP9_HIGHBITDEPTH - const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; - *out_dist = vp9_highbd_block_error_dispatch(coeff, dqcoeff, - 16 << ss_txfrm_size, - &this_sse, bd) >> shift; + const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8; + *out_dist = vp9_highbd_block_error_dispatch( + coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse, bd) >> + shift; #else - *out_dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, - &this_sse) >> shift; + *out_dist = + vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size, &this_sse) >> + shift; #endif // CONFIG_VP9_HIGHBITDEPTH - *out_sse = this_sse >> shift; + *out_sse = this_sse >> shift; - if (x->skip_encode && !is_inter_block(xd->mi[0])) { - // TODO(jingning): tune the model to better capture the distortion. - int64_t p = (pd->dequant[1] * pd->dequant[1] * - (1 << ss_txfrm_size)) >> + if (x->skip_encode && !is_inter_block(xd->mi[0])) { + // TODO(jingning): tune the model to better capture the distortion. + const int64_t p = + (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> +#if CONFIG_VP9_HIGHBITDEPTH + (shift + 2 + (bd - 8) * 2); +#else + (shift + 2); +#endif // CONFIG_VP9_HIGHBITDEPTH + *out_dist += (p >> 4); + *out_sse += p; + } + } else { + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const int bs = 4 * num_4x4_blocks_wide_lookup[tx_bsize]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int src_idx = 4 * (blk_row * src_stride + blk_col); + const int dst_idx = 4 * (blk_row * dst_stride + blk_col); + const uint8_t *src = &p->src.buf[src_idx]; + const uint8_t *dst = &pd->dst.buf[dst_idx]; + const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const uint16_t *eob = &p->eobs[block]; + unsigned int tmp; + + cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp); + *out_sse = (int64_t)tmp * 16; + + if (*eob) { #if CONFIG_VP9_HIGHBITDEPTH - (shift + 2 + (bd - 8) * 2); + DECLARE_ALIGNED(16, uint16_t, recon16[1024]); + uint8_t *recon = (uint8_t *)recon16; #else - (shift + 2); + DECLARE_ALIGNED(16, uint8_t, recon[1024]); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + recon = CONVERT_TO_BYTEPTR(recon); + vpx_highbd_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, + bs, bs, xd->bd); + if (xd->lossless) { + vp9_highbd_iwht4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + } else { + switch (tx_size) { + case TX_4X4: + vp9_highbd_idct4x4_add(dqcoeff, recon, 32, *eob, xd->bd); + break; + case TX_8X8: + vp9_highbd_idct8x8_add(dqcoeff, recon, 32, *eob, xd->bd); + break; + case TX_16X16: + vp9_highbd_idct16x16_add(dqcoeff, recon, 32, *eob, xd->bd); + break; + case TX_32X32: + vp9_highbd_idct32x32_add(dqcoeff, recon, 32, *eob, xd->bd); + break; + default: + assert(0 && "Invalid transform size"); + } + } + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); + switch (tx_size) { + case TX_32X32: + vp9_idct32x32_add(dqcoeff, recon, 32, *eob); + break; + case TX_16X16: + vp9_idct16x16_add(dqcoeff, recon, 32, *eob); + break; + case TX_8X8: + vp9_idct8x8_add(dqcoeff, recon, 32, *eob); + break; + case TX_4X4: + // this is like vp9_short_idct4x4 but has a special case around + // eob<=1, which is significant (not just an optimization) for + // the lossless case. + x->itxm_add(dqcoeff, recon, 32, *eob); + break; + default: + assert(0 && "Invalid transform size"); + break; + } +#if CONFIG_VP9_HIGHBITDEPTH + } #endif // CONFIG_VP9_HIGHBITDEPTH - *out_dist += (p >> 4); - *out_sse += p; + + cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, 32, &tmp); + } + + *out_dist = (int64_t)tmp * 16; } } @@ -506,9 +592,8 @@ static int rate_block(int plane, int block, int row, int col, args->use_fast_coef_costing); } -static void block_rd_txfm(int plane, int block, int row, int col, - BLOCK_SIZE plane_bsize, - TX_SIZE tx_size, void *arg) { +static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg) { struct rdcost_block_args *args = arg; MACROBLOCK *const x = args->x; MACROBLOCKD *const xd = &x->e_mbd; @@ -523,20 +608,47 @@ static void block_rd_txfm(int plane, int block, int row, int col, if (!is_inter_block(mi)) { struct encode_b_args arg = {x, NULL, &mi->skip}; - vp9_encode_block_intra(plane, block, row, col, plane_bsize, tx_size, &arg); - dist_block(x, plane, block, tx_size, &dist, &sse); + vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, + &arg); + if (args->cpi->sf.txfm_domain_distortion) { + dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist, + &sse); + } else { + const int bs = 4 << tx_size; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; + const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int src_stride = p->src.stride; + const int dst_stride = pd->dst.stride; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; + const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; + const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + unsigned int tmp; + sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs); +#if CONFIG_VP9_HIGHBITDEPTH + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8)) + sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); +#endif // CONFIG_VP9_HIGHBITDEPTH + sse = sse * 16; + variance(src, src_stride, dst, dst_stride, &tmp); + dist = (int64_t)tmp * 16; + } } else if (max_txsize_lookup[plane_bsize] == tx_size) { if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == SKIP_TXFM_NONE) { // full forward transform and quantization - vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); - dist_block(x, plane, block, tx_size, &dist, &sse); + vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); + dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist, + &sse); } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block); - vp9_xform_quant_dc(x, plane, block, row, col, plane_bsize, tx_size); + vp9_xform_quant_dc(x, plane, block, blk_row, blk_col, plane_bsize, + tx_size); sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; dist = sse; if (x->plane[plane].eobs[block]) { @@ -560,8 +672,9 @@ static void block_rd_txfm(int plane, int block, int row, int col, } } else { // full forward transform and quantization - vp9_xform_quant(x, plane, block, row, col, plane_bsize, tx_size); - dist_block(x, plane, block, tx_size, &dist, &sse); + vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); + dist_block(args->cpi, x, plane, block, blk_row, blk_col, tx_size, &dist, + &sse); } rd = RDCOST(x->rdmult, x->rddiv, 0, dist); @@ -570,7 +683,7 @@ static void block_rd_txfm(int plane, int block, int row, int col, return; } - rate = rate_block(plane, block, row, col, tx_size, args); + rate = rate_block(plane, block, blk_row, blk_col, tx_size, args); rd1 = RDCOST(x->rdmult, x->rddiv, rate, dist); rd2 = RDCOST(x->rdmult, x->rddiv, 0, sse); @@ -593,16 +706,15 @@ static void block_rd_txfm(int plane, int block, int row, int col, args->skippable &= !x->plane[plane].eobs[block]; } -static void txfm_rd_in_plane(MACROBLOCK *x, - int *rate, int64_t *distortion, - int *skippable, int64_t *sse, - int64_t ref_best_rd, int plane, - BLOCK_SIZE bsize, TX_SIZE tx_size, - int use_fast_coef_casting) { +static void txfm_rd_in_plane(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, + int64_t *distortion, int *skippable, int64_t *sse, + int64_t ref_best_rd, int plane, BLOCK_SIZE bsize, + TX_SIZE tx_size, int use_fast_coef_casting) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblockd_plane *const pd = &xd->plane[plane]; struct rdcost_block_args args; vp9_zero(args); + args.cpi = cpi; args.x = x; args.best_rd = ref_best_rd; args.use_fast_coef_costing = use_fast_coef_casting; @@ -643,8 +755,7 @@ static void choose_largest_tx_size(VP9_COMP *cpi, MACROBLOCK *x, mi->tx_size = VPXMIN(max_tx_size, largest_tx_size); - txfm_rd_in_plane(x, rate, distortion, skip, - sse, ref_best_rd, 0, bs, + txfm_rd_in_plane(cpi, x, rate, distortion, skip, sse, ref_best_rd, 0, bs, mi->tx_size, cpi->sf.use_fast_coef_costing); } @@ -695,9 +806,8 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, else r_tx_size += vp9_cost_one(tx_probs[m]); } - txfm_rd_in_plane(x, &r[n][0], &d[n], &s[n], - &sse[n], ref_best_rd, 0, bs, n, - cpi->sf.use_fast_coef_costing); + txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, + bs, n, cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; if (r[n][0] < INT_MAX) { r[n][1] += r_tx_size; @@ -1172,9 +1282,8 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, *skippable = 1; for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse, - ref_best_rd, plane, bsize, uv_tx_size, - cpi->sf.use_fast_coef_costing); + txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, + plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing); if (pnrate == INT_MAX) { is_cost_valid = 0; break; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index e7f04a244..bc95ae065 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -162,6 +162,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V; + sf->txfm_domain_distortion = 1; } if (speed >= 2) { @@ -279,6 +280,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, sf->exhaustive_searches_thresh = INT_MAX; if (speed >= 1) { + sf->txfm_domain_distortion = 1; sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD @@ -541,6 +543,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->disable_filter_search_var_thresh = 0; sf->adaptive_interp_filter_search = 0; sf->allow_partition_search_skip = 0; + sf->txfm_domain_distortion = 0; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index e88a7dfff..2cbf021f8 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -246,6 +246,11 @@ typedef struct SPEED_FEATURES { // Coefficient probability model approximation step size int coeff_prob_appx_step; + // Use transform domain distortion. Use pixel domain distortion when + // this flag is set to be zero. The pixel domain distortion computation + // improves the distortion metric precision. + int txfm_domain_distortion; + // The threshold is to determine how slow the motino is, it is used when // use_lastframe_partitioning is set to LAST_FRAME_PARTITION_LOW_MOTION MOTION_THRESHOLD lf_motion_threshold; diff --git a/vpx_dsp/sum_squares.c b/vpx_dsp/sum_squares.c new file mode 100644 index 000000000..7c535ac2d --- /dev/null +++ b/vpx_dsp/sum_squares.c @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, + int size) { + int r, c; + uint64_t ss = 0; + + for (r = 0; r < size; r++) { + for (c = 0; c < size; c++) { + const int16_t v = src[c]; + ss += v * v; + } + src += src_stride; + } + + return ss; +} diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 018126d4b..828536c29 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -277,6 +277,8 @@ endif # CONFIG_VP9_ENCODER ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c +DSP_SRCS-yes += sum_squares.c +DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 37239a195..38fc09734 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1169,6 +1169,9 @@ specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc"; add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc"; +add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; +specialize qw/vpx_sum_squares_2d_i16 sse2/; + # # Structured Similarity (SSIM) # diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c new file mode 100644 index 000000000..bc5362e10 --- /dev/null +++ b/vpx_dsp/x86/sum_squares_sse2.c @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include + +#include "./vpx_dsp_rtcd.h" + +static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, + int stride) { + const __m128i v_val_0_w = + _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + const __m128i v_val_1_w = + _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); + const __m128i v_val_2_w = + _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + const __m128i v_val_3_w = + _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + + const __m128i v_sum_d = + _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); + + return (uint64_t)_mm_cvtsi128_si32(v_sum_d); +} + +// TODO(jingning): Evaluate the performance impact here. +#ifdef __GNUC__ +// This prevents GCC/Clang from inlining this function into +// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack +// maintenance instructions in the common case of 4x4. +__attribute__((noinline)) +#endif +static uint64_t +vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) { + int r, c; + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); + + for (r = 0; r < size; r += 8) { + __m128i v_acc_d = _mm_setzero_si128(); + + for (c = 0; c < size; c += 8) { + const int16_t *b = src + c; + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + } + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + } + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + +#if ARCH_X86_64 + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); +#else + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } +#endif +} + +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { + // 4 elements per row only requires half an XMM register, so this + // must be a special case, but also note that over 75% of all calls + // are with size == 4, so it is also the common case. + if (size == 4) { + return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); + } else { + // Generic case + return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); + } +} -- 2.40.0