From 8c411f74e000457b43bc75093b5908819b34e394 Mon Sep 17 00:00:00 2001
From: Jingning Han
Date: Mon, 23 Mar 2015 10:02:42 -0700
Subject: [PATCH] Hadamard transform based coding mode decision process

This commit uses Hadamard transform based rate-distortion cost
estimate for rtc coding mode decision. It improves the compression
performance of speed -6 for many hard clips at lower bit-rates. For
example, 5.5% for jimredvga, 6.7% for mmmoving, 6.1% for niklas720p.

This will introduce extra encoding cycle costs at this point.

Change-Id: Iaf70634fa2417a705ee29f2456175b981db3d375
---
 vp9/common/vp9_rtcd_defs.pl           |   9 ++
 vp9/encoder/vp9_avg.c                 |  81 +++++++++++
 vp9/encoder/vp9_pickmode.c            | 197 +++++++++++++++++++++++---
 vp9/encoder/x86/vp9_avg_intrin_sse2.c | 135 ++++++++++++++++++
 4 files changed, 400 insertions(+), 22 deletions(-)

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 134481356..ed43010e3 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1109,6 +1109,15 @@ specialize qw/vp9_avg_8x8 sse2 neon/;
 add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
 specialize qw/vp9_avg_4x4 sse2/;
 
+add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_8x8 sse2/;
+
+add_proto qw/void vp9_hadamard_16x16/, "int16_t *coeff";
+specialize qw/vp9_hadamard_16x16/;
+
+add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2/;
+
 add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
 specialize qw/vp9_int_pro_row sse2/;
 
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index dc5cfe268..b5632a20d 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -28,6 +28,87 @@ unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) {
   return (sum + 8) >> 4;
 }
 
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+                          int16_t *coeff) {
+  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+  int16_t c0 = b0 + b2;
+  int16_t c1 = b1 + b3;
+  int16_t c2 = b0 - b2;
+  int16_t c3 = b1 - b3;
+  int16_t c4 = b4 + b6;
+  int16_t c5 = b5 + b7;
+  int16_t c6 = b4 - b6;
+  int16_t c7 = b5 - b7;
+
+  coeff[0] = c0 + c4;
+  coeff[7] = c1 + c5;
+  coeff[3] = c2 + c6;
+  coeff[4] = c3 + c7;
+  coeff[2] = c0 - c4;
+  coeff[6] = c1 - c5;
+  coeff[1] = c2 - c6;
+  coeff[5] = c3 - c7;
+}
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+                        int16_t *coeff) {
+  int idx;
+  int16_t buffer[64];
+  int16_t *tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    hadamard_col8(src_diff, src_stride, tmp_buf);
+    tmp_buf += 8;
+    ++src_diff;
+  }
+
+  tmp_buf = &buffer[0];
+  for (idx = 0; idx < 8; ++idx) {
+    hadamard_col8(tmp_buf, 8, coeff);
+    coeff += 8;
+    ++tmp_buf;
+  }
+}
+
+// In place 16x16 2D Hadamard transform
+void vp9_hadamard_16x16_c(int16_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 64; ++idx) {
+    int16_t a0 = coeff[0];
+    int16_t a1 = coeff[64];
+    int16_t a2 = coeff[128];
+    int16_t a3 = coeff[192];
+
+    int16_t b0 = a0 + a1;
+    int16_t b1 = a0 - a1;
+    int16_t b2 = a2 + a3;
+    int16_t b3 = a2 - a3;
+
+    coeff[0] = (b0 + b2) >> 1;
+    coeff[64] = (b1 + b3) >> 1;
+    coeff[128] = (b0 - b2) >> 1;
+    coeff[192] = (b1 - b3) >> 1;
+
+    ++coeff;
+  }
+}
+
+int16_t vp9_satd_c(const int16_t *coeff, int length) {
+  int i;
+  int satd = 0;
+  for (i = 0; i < length; ++i)
+    satd += abs(coeff[i]);
+
+  return (int16_t)satd;
+}
+
 // Integer projection onto row vectors.
 void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
                        const int ref_stride, const int height) {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index ad895e7f1..9a49484ab 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -20,9 +20,11 @@
 #include "vp9/common/vp9_blockd.h"
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
+#include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_pickmode.h"
 #include "vp9/encoder/vp9_ratectrl.h"
@@ -188,6 +190,8 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
                                  cond_cost_list(cpi, cost_list),
                                  x->nmvjointcost, x->mvcost,
                                  &dis, &x->pred_sse[ref], NULL, 0, 0);
+    *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+                               x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   }
 
   if (scaled_ref_frame) {
@@ -198,7 +202,6 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   return rv;
 }
 
-
 static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
                               MACROBLOCK *x, MACROBLOCKD *xd,
                               int *out_rate_sum, int64_t *out_dist_sum,
@@ -312,6 +315,105 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   *out_dist_sum += dist << 4;
 }
 
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+                      int *skippable, int64_t *sse, int plane,
+                      BLOCK_SIZE bsize, TX_SIZE tx_size) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+  const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+  const int step = 1 << (tx_size << 1);
+  const int block_step = (1 << tx_size);
+  int block = 0, r, c;
+  int shift = tx_size == TX_32X32 ? 0 : 2;
+  const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+      xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+  const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+      xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  unsigned int var_y, sse_y;
+  model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+  *sse = INT_MAX;
+  *skippable = 0;
+  return;
+#else
+  (void)cpi;
+#endif
+
+  vp9_subtract_plane(x, bsize, plane);
+
+  *skippable = 1;
+  *rate = 0;
+  *dist = 0;
+  *sse = 0;
+
+  // Keep track of the row and column of the blocks we use so that we know
+  // if we are in the unrestricted motion border.
+  for (r = 0; r < max_blocks_high; r += block_step) {
+    for (c = 0; c < num_4x4_w; c += block_step) {
+      if (c < max_blocks_wide) {
+        const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+        tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+        tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+        tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+        uint16_t *const eob = &p->eobs[block];
+        const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+        int i, j;
+        const int16_t *src_diff;
+        int64_t this_sse;
+        txfrm_block_to_raster_xy(bsize, tx_size, block, &i, &j);
+        src_diff = &p->src_diff[4 * (j * diff_stride + i)];
+
+        switch (tx_size) {
+          case TX_32X32:
+            vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
+            vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+                                  p->round_fp, p->quant_fp, p->quant_shift,
+                                  qcoeff, dqcoeff, pd->dequant, eob,
+                                  scan_order->scan, scan_order->iscan);
+            break;
+          case TX_16X16:
+            vp9_fdct16x16(src_diff, coeff, diff_stride);
+            vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          case TX_8X8:
+            vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+            vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          case TX_4X4:
+            x->fwd_txm4x4(src_diff, coeff, diff_stride);
+            vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+                            pd->dequant, eob,
+                            scan_order->scan, scan_order->iscan);
+            break;
+          default:
+            assert(0);
+            break;
+        }
+
+        *dist += vp9_block_error(coeff, dqcoeff, step << 4, &this_sse) >> shift;
+        *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+
+        *sse += (this_sse >> shift);
+        *skippable &= (*eob == 0);
+      }
+      block += step;
+    }
+  }
+
+  *rate <<= 8;
+  *rate *= 6;
+}
+
 static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                MACROBLOCK *x, MACROBLOCKD *xd,
                                int *out_rate_sum, int64_t *out_dist_sum,
@@ -518,7 +620,9 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   int i, j;
   int rate;
   int64_t dist;
-  unsigned int var_y, sse_y;
+  int64_t this_sse;
+  int is_skippable;
+
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   assert(plane == 0);
   (void) plane;
@@ -533,8 +637,16 @@ static void estimate_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
                           x->skip_encode ? src_stride : dst_stride,
                           pd->dst.buf, dst_stride,
                           i, j, 0);
-  // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
-  model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+
+  // TODO(jingning): This needs further refactoring.
+  block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+            bsize_tx, tx_size);
+  x->skip_txfm[0] = is_skippable;
+  if (is_skippable)
+    rate = vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 1);
+  else
+    rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), 0);
+
   p->src.buf = src_buf_base;
   pd->dst.buf = dst_buf_base;
   args->rate += rate;
@@ -602,10 +714,6 @@ void vp9_pick_intra_mode(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost,
   *rd_cost = best_rdc;
 }
 
-static const int ref_frame_cost[MAX_REF_FRAMES] = {
-  1235, 229, 530, 615,
-};
-
 typedef struct {
   MV_REFERENCE_FRAME ref_frame;
   PREDICTION_MODE pred_mode;
@@ -682,6 +790,20 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   int ref_frame_skip_mask = 0;
   int idx;
   int best_pred_sad = INT_MAX;
+  int ref_frame_cost[MAX_REF_FRAMES];
+  vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+  vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+  vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+
+  ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+  ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
+      ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);
+
+  ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
+  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+  ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+  ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
 
   if (reuse_inter_pred) {
     int i;
@@ -773,6 +895,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
     int mode_index;
     int i;
     PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+    int64_t this_sse;
+    int is_skippable;
+
     if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
       continue;
 
@@ -935,17 +1060,40 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
           vp9_get_switchable_rate(cpi, xd) : 0;
     }
 
+    // TODO(jingning): disable color operations temporarily.
     // chroma component rate-distortion cost modeling
-    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
-      int uv_rate = 0;
-      int64_t uv_dist = 0;
-      if (x->color_sensitivity[0])
-        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
-      if (x->color_sensitivity[1])
-        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
-      model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist, &var_y, &sse_y);
-      this_rdc.rate += uv_rate;
-      this_rdc.dist += uv_dist;
+//    if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
+//      int uv_rate = 0;
+//      int64_t uv_dist = 0;
+//      if (x->color_sensitivity[0])
+//        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
+//      if (x->color_sensitivity[1])
+//        vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
+//      model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
+//                         &var_y, &sse_y);
+//      this_rdc.rate += uv_rate;
+//      this_rdc.dist += uv_dist;
+//    }
+
+    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+    block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable, &this_sse,
+              0, bsize, mbmi->tx_size);
+    x->skip_txfm[0] = is_skippable;
+    if (is_skippable) {
+      this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+    } else {
+      if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+          RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+        this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+      } else {
+        this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+        this_rdc.dist = this_sse;
+      }
+    }
+
+    if (cm->interp_filter == SWITCHABLE) {
+      if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
+        this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
     }
 
     this_rdc.rate += rate_mv;
@@ -1042,6 +1190,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
       const PREDICTION_MODE this_mode = intra_mode_list[i];
       if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
         continue;
+      mbmi->mode = this_mode;
+      mbmi->ref_frame[0] = INTRA_FRAME;
       args.mode = this_mode;
       args.rate = 0;
       args.dist = 0;
@@ -1058,17 +1208,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = this_rdc;
-        mbmi->mode = this_mode;
+        best_mode = this_mode;
         best_intra_tx_size = mbmi->tx_size;
-        mbmi->ref_frame[0] = INTRA_FRAME;
+        best_ref_frame = INTRA_FRAME;
         mbmi->uv_mode = this_mode;
         mbmi->mv[0].as_int = INVALID_MV;
+        best_mode_skip_txfm = x->skip_txfm[0];
       }
     }
 
     // Reset mb_mode_info to the best inter mode.
-    if (mbmi->ref_frame[0] != INTRA_FRAME) {
-      x->skip_txfm[0] = best_mode_skip_txfm;
+    if (best_ref_frame != INTRA_FRAME) {
       mbmi->tx_size = best_tx_size;
     } else {
       mbmi->tx_size = best_intra_tx_size;
@@ -1076,6 +1226,9 @@
   }
 
   pd->dst = orig_dst;
+  mbmi->mode = best_mode;
+  mbmi->ref_frame[0] = best_ref_frame;
+  x->skip_txfm[0] = best_mode_skip_txfm;
 
   if (reuse_inter_pred && best_pred != NULL) {
     if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 4e80b255e..3f78b8a0f 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -57,6 +57,141 @@ unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
   return (avg + 8) >> 4;
 }
 
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+  __m128i a0 = in[0];
+  __m128i a1 = in[1];
+  __m128i a2 = in[2];
+  __m128i a3 = in[3];
+  __m128i a4 = in[4];
+  __m128i a5 = in[5];
+  __m128i a6 = in[6];
+  __m128i a7 = in[7];
+
+  __m128i b0 = _mm_add_epi16(a0, a1);
+  __m128i b1 = _mm_sub_epi16(a0, a1);
+  __m128i b2 = _mm_add_epi16(a2, a3);
+  __m128i b3 = _mm_sub_epi16(a2, a3);
+  __m128i b4 = _mm_add_epi16(a4, a5);
+  __m128i b5 = _mm_sub_epi16(a4, a5);
+  __m128i b6 = _mm_add_epi16(a6, a7);
+  __m128i b7 = _mm_sub_epi16(a6, a7);
+
+  a0 = _mm_add_epi16(b0, b2);
+  a1 = _mm_add_epi16(b1, b3);
+  a2 = _mm_sub_epi16(b0, b2);
+  a3 = _mm_sub_epi16(b1, b3);
+  a4 = _mm_add_epi16(b4, b6);
+  a5 = _mm_add_epi16(b5, b7);
+  a6 = _mm_sub_epi16(b4, b6);
+  a7 = _mm_sub_epi16(b5, b7);
+
+  if (iter == 0) {
+    b0 = _mm_add_epi16(a0, a4);
+    b1 = _mm_add_epi16(a1, a5);
+    b2 = _mm_add_epi16(a2, a6);
+    b3 = _mm_add_epi16(a3, a7);
+    b4 = _mm_sub_epi16(a0, a4);
+    b5 = _mm_sub_epi16(a1, a5);
+    b6 = _mm_sub_epi16(a2, a6);
+    b7 = _mm_sub_epi16(a3, a7);
+
+    a0 = _mm_unpacklo_epi16(b0, b1);
+    a1 = _mm_unpacklo_epi16(b2, b3);
+    a2 = _mm_unpackhi_epi16(b0, b1);
+    a3 = _mm_unpackhi_epi16(b2, b3);
+    a4 = _mm_unpacklo_epi16(b4, b5);
+    a5 = _mm_unpacklo_epi16(b6, b7);
+    a6 = _mm_unpackhi_epi16(b4, b5);
+    a7 = _mm_unpackhi_epi16(b6, b7);
+
+    b0 = _mm_unpacklo_epi32(a0, a1);
+    b1 = _mm_unpacklo_epi32(a4, a5);
+    b2 = _mm_unpackhi_epi32(a0, a1);
+    b3 = _mm_unpackhi_epi32(a4, a5);
+    b4 = _mm_unpacklo_epi32(a2, a3);
+    b5 = _mm_unpacklo_epi32(a6, a7);
+    b6 = _mm_unpackhi_epi32(a2, a3);
+    b7 = _mm_unpackhi_epi32(a6, a7);
+
+    in[0] = _mm_unpacklo_epi64(b0, b1);
+    in[7] = _mm_unpackhi_epi64(b0, b1);
+    in[3] = _mm_unpacklo_epi64(b2, b3);
+    in[4] = _mm_unpackhi_epi64(b2, b3);
+    in[2] = _mm_unpacklo_epi64(b4, b5);
+    in[6] = _mm_unpackhi_epi64(b4, b5);
+    in[1] = _mm_unpacklo_epi64(b6, b7);
+    in[5] = _mm_unpackhi_epi64(b6, b7);
+  } else {
+    in[0] = _mm_add_epi16(a0, a4);
+    in[7] = _mm_add_epi16(a1, a5);
+    in[3] = _mm_add_epi16(a2, a6);
+    in[4] = _mm_add_epi16(a3, a7);
+    in[2] = _mm_sub_epi16(a0, a4);
+    in[6] = _mm_sub_epi16(a1, a5);
+    in[1] = _mm_sub_epi16(a2, a6);
+    in[5] = _mm_sub_epi16(a3, a7);
+  }
+}
+
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+                           int16_t *coeff) {
+  __m128i src[8];
+  src[0] = _mm_load_si128((const __m128i *)src_diff);
+  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+  hadamard_col8_sse2(src, 0);
+  hadamard_col8_sse2(src, 1);
+
+  _mm_storeu_si128((__m128i *)coeff, src[0]);
+  coeff += 8;
+  _mm_storeu_si128((__m128i *)coeff, src[1]);
+  coeff += 8;
+  _mm_storeu_si128((__m128i *)coeff, src[2]);
+  coeff += 8;
+  _mm_storeu_si128((__m128i *)coeff, src[3]);
+  coeff += 8;
+  _mm_storeu_si128((__m128i *)coeff, src[4]);
+  coeff += 8;
+  _mm_storeu_si128((__m128i *)coeff, src[5]);
+  coeff += 8;
+  _mm_storeu_si128((__m128i *)coeff, src[6]);
+  coeff += 8;
+  _mm_storeu_si128((__m128i *)coeff, src[7]);
+}
+
+int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+  int i;
+  __m128i sum = _mm_load_si128((const __m128i *)coeff);
+  __m128i sign = _mm_srai_epi16(sum, 15);
+  __m128i val = _mm_xor_si128(sum, sign);
+  sum = _mm_sub_epi16(val, sign);
+  coeff += 8;
+
+  for (i = 8; i < length; i += 8) {
+    __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    sign = _mm_srai_epi16(src_line, 15);
+    val = _mm_xor_si128(src_line, sign);
+    val = _mm_sub_epi16(val, sign);
+    sum = _mm_add_epi16(sum, val);
+    coeff += 8;
+  }
+
+  val = _mm_srli_si128(sum, 8);
+  sum = _mm_add_epi16(sum, val);
+  val = _mm_srli_epi64(sum, 32);
+  sum = _mm_add_epi16(sum, val);
+  val = _mm_srli_epi32(sum, 16);
+  sum = _mm_add_epi16(sum, val);
+
+  return _mm_extract_epi16(sum, 0);
+}
+
 void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
                           const int ref_stride, const int height) {
   int idx;
-- 
2.40.0
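
Note: vp9_hadamard_16x16_c operates in place on 256 coefficients laid out as
four consecutive 8x8 Hadamard blocks, but nothing in this patch calls it yet
(block_yrd still uses vp9_fdct16x16 for TX_16X16). Below is a minimal sketch
of one plausible driver under that layout assumption; the helper name
hadamard_16x16_sketch and the raster ordering of the quadrants are
illustrative assumptions, not taken from the patch.

    #include <stdint.h>

    void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
                            int16_t *coeff);
    void vp9_hadamard_16x16_c(int16_t *coeff);

    // Sketch: build a 16x16 Hadamard from four 8x8 transforms, then run
    // the in-place cross-block combining stage. coeff[] holds 256 values.
    static void hadamard_16x16_sketch(const int16_t *src_diff, int src_stride,
                                      int16_t *coeff) {
      int idx;
      for (idx = 0; idx < 4; ++idx) {
        // 8x8 quadrants in assumed raster order: top-left, top-right,
        // bottom-left, bottom-right.
        const int16_t *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
                                          + (idx & 0x01) * 8;
        vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
      }
      // Butterflies across the four blocks; the >> 1 inside keeps the
      // result in 16-bit range at the cost of one bit of precision.
      vp9_hadamard_16x16_c(coeff);
    }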
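
Note on vp9_satd: both the C and SSE2 versions return int16_t computed from a
wider accumulation, so the sum of absolute values can wrap for large inputs
(block_yrd passes up to step << 4 = 1024 coefficients for TX_32X32), and the
SSE2 version additionally accumulates in 16-bit lanes and uses aligned loads,
so callers must supply 16-byte-aligned buffers whose length is a multiple of
8. A minimal cross-check harness, assuming the two functions are linked in;
the gcc alignment attribute and the test pattern are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    int16_t vp9_satd_c(const int16_t *coeff, int length);
    int16_t vp9_satd_sse2(const int16_t *coeff, int length);

    int main(void) {
      // 16-byte alignment satisfies the _mm_load_si128 calls in the
      // SSE2 path; length 64 is a multiple of 8 as required.
      static int16_t coeff[64] __attribute__((aligned(16)));
      int i;
      for (i = 0; i < 64; ++i)
        coeff[i] = (int16_t)(((i * 37) % 128) - 64);  // small signed values
      printf("satd_c = %d, satd_sse2 = %d\n",
             vp9_satd_c(coeff, 64), vp9_satd_sse2(coeff, 64));
      return 0;
    }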