From: Yunqing Wang Date: Fri, 22 Jul 2011 20:01:11 +0000 (-0400) Subject: Preload reference area in sub-pixel motion search (real-time mode) X-Git-Tag: v0.9.7~15 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2f2302f8d5e41862e517f34544f9bcd8c8edeaad;p=libvpx Preload reference area in sub-pixel motion search (real-time mode) This change implements the same idea as the change "Preload reference area to an intermediate buffer in sub-pixel motion search." The changes were made to vp8_find_best_sub_pixel_step() and vp8_find_best_half_pixel_step() functions which are called when speed >= 5. Test result (using tulip clip): 1. On Core2 Quad machine(Linux) rt mode, speed (-5 ~ -8), encoding speed gain: 2% ~ 3% rt mode, speed (-9 ~ -11), encoding speed gain: 1% ~ 2% rt mode, speed (-12 ~ -14), no noticeable encoding speed gain 2. On Xeon machine(Linux) Test on speed (-5 ~ -14) didn't show noticeable speed change. Change-Id: I21bec2d6e7fbe541fcc0f4c0366bbdf3e2076aa2 --- diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 282e5c91b..ff84306e7 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -343,12 +343,26 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int bestmse = INT_MAX; int_mv startmv; int_mv this_mv; - unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; unsigned char *z = (*(b->base_src) + b->src); int left, right, up, down, diag; unsigned int sse; int whichdir ; int thismse; + int y_stride; + +#if ARCH_X86 || ARCH_X86_64 + MACROBLOCKD *xd = &x->e_mbd; + unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; + unsigned char *y; + + y_stride = 32; + /* Copy 18 rows x 32 cols area to intermediate buffer before searching. 
*/ + vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18); + y = xd->y_buf + y_stride + 1; +#else + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; + y_stride = d->pre_stride; +#endif // central mv bestmv->as_mv.row <<= 3; @@ -356,14 +370,14 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, startmv = *bestmv; // calculate central point error - bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); + bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1); *distortion = bestmse; bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); // go left then right and check error this_mv.as_mv.row = startmv.as_mv.row; this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); - thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse); left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (left < bestmse) @@ -375,7 +389,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } this_mv.as_mv.col += 8; - thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse); right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (right < bestmse) @@ -389,7 +403,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, // go up then down and check error this_mv.as_mv.col = startmv.as_mv.col; this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); - thismse = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse); up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (up < bestmse) @@ -401,7 +415,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } this_mv.as_mv.row += 8; - thismse = vfp->svf_halfpix_v(y, 
d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse); down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (down < bestmse) @@ -424,23 +438,23 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, case 0: this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse); break; case 1: this_mv.as_mv.col += 4; this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - y_stride, y_stride, z, b->src_stride, &sse); break; case 2: this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse); break; case 3: default: this_mv.as_mv.col += 4; this_mv.as_mv.row += 4; - thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse); break; } @@ -459,7 +473,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, // time to check quarter pels. 
if (bestmv->as_mv.row < startmv.as_mv.row) - y -= d->pre_stride; + y -= y_stride; if (bestmv->as_mv.col < startmv.as_mv.col) y--; @@ -474,12 +488,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, if (startmv.as_mv.col & 7) { this_mv.as_mv.col = startmv.as_mv.col - 2; - thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse); } left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); @@ -493,7 +507,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } this_mv.as_mv.col += 4; - thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (right < bestmse) @@ -510,12 +524,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, if (startmv.as_mv.row & 7) { this_mv.as_mv.row = startmv.as_mv.row - 2; - thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.as_mv.row = (startmv.as_mv.row - 8) | 6; - thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse); + thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse); } up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); @@ -529,7 +543,7 @@ int 
vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } this_mv.as_mv.row += 4; - thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (down < bestmse) @@ -559,12 +573,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, if (startmv.as_mv.col & 7) { this_mv.as_mv.col -= 2; - thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);; + thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse);; } } else @@ -574,12 +588,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, if (startmv.as_mv.col & 7) { this_mv.as_mv.col -= 2; - thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse); + thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse); } else { this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse); + thismse = vfp->svf(y - y_stride - 1, y_stride, 6, 6, z, b->src_stride, &sse); } } @@ -590,12 +604,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, if (startmv.as_mv.row & 7) { this_mv.as_mv.row -= 2; - thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.as_mv.row = (startmv.as_mv.row - 8) | 
6; - thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse); + thismse = vfp->svf(y - y_stride, y_stride, this_mv.as_mv.col & 7, 6, z, b->src_stride, &sse); } break; @@ -605,19 +619,19 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, if (startmv.as_mv.col & 7) { this_mv.as_mv.col -= 2; - thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.as_mv.col = (startmv.as_mv.col - 8) | 6; - thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y - 1, y_stride, 6, this_mv.as_mv.row & 7, z, b->src_stride, &sse); } break; case 3: this_mv.as_mv.col += 2; this_mv.as_mv.row += 2; - thismse = vfp->svf(y, d->pre_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, y_stride, this_mv.as_mv.col & 7, this_mv.as_mv.row & 7, z, b->src_stride, &sse); break; } @@ -634,7 +648,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, return bestmse; } -int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, +int vp8_find_best_half_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, @@ -644,11 +658,25 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, int bestmse = INT_MAX; int_mv startmv; int_mv this_mv; - unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; unsigned char *z = (*(b->base_src) + b->src); int left, right, up, down, diag; unsigned int sse; int thismse; + int y_stride; + +#if ARCH_X86 || ARCH_X86_64 + MACROBLOCKD *xd = &x->e_mbd; + unsigned char *y0 = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; + unsigned char 
*y; + + y_stride = 32; + /* Copy 18 rows x 32 cols area to intermediate buffer before searching. */ + vfp->copymem(y0 - 1 - d->pre_stride, d->pre_stride, xd->y_buf, y_stride, 18); + y = xd->y_buf + y_stride + 1; +#else + unsigned char *y = *(d->base_pre) + d->pre + (bestmv->as_mv.row) * d->pre_stride + bestmv->as_mv.col; + y_stride = d->pre_stride; +#endif // central mv bestmv->as_mv.row <<= 3; @@ -656,14 +684,14 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, startmv = *bestmv; // calculate central point error - bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); + bestmse = vfp->vf(y, y_stride, z, b->src_stride, sse1); *distortion = bestmse; bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); // go left then right and check error this_mv.as_mv.row = startmv.as_mv.row; this_mv.as_mv.col = ((startmv.as_mv.col - 8) | 4); - thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_h(y - 1, y_stride, z, b->src_stride, &sse); left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (left < bestmse) @@ -675,7 +703,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, } this_mv.as_mv.col += 8; - thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_h(y, y_stride, z, b->src_stride, &sse); right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (right < bestmse) @@ -689,7 +717,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, // go up then down and check error this_mv.as_mv.col = startmv.as_mv.col; this_mv.as_mv.row = ((startmv.as_mv.row - 8) | 4); - thismse = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_v(y - y_stride, y_stride, z, b->src_stride, &sse); up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (up < bestmse) @@ -701,7 +729,7 @@ int 
vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, } this_mv.as_mv.row += 8; - thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_v(y, y_stride, z, b->src_stride, &sse); down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (down < bestmse) @@ -723,22 +751,22 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, case 0: this_mv.col = (this_mv.col - 8) | 4; this_mv.row = (this_mv.row - 8) | 4; - diag = vfp->svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + diag = vfp->svf(y - 1 - y_stride, y_stride, 4, 4, z, b->src_stride, &sse); break; case 1: this_mv.col += 4; this_mv.row = (this_mv.row - 8) | 4; - diag = vfp->svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse); + diag = vfp->svf(y - y_stride, y_stride, 4, 4, z, b->src_stride, &sse); break; case 2: this_mv.col = (this_mv.col - 8) | 4; this_mv.row += 4; - diag = vfp->svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse); + diag = vfp->svf(y - 1, y_stride, 4, 4, z, b->src_stride, &sse); break; case 3: this_mv.col += 4; this_mv.row += 4; - diag = vfp->svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse); + diag = vfp->svf(y, y_stride, 4, 4, z, b->src_stride, &sse); break; } @@ -753,7 +781,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, #else this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; this_mv.as_mv.row = (this_mv.as_mv.row - 8) | 4; - thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - 1 - y_stride, y_stride, z, b->src_stride, &sse); diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) @@ -765,7 +793,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, } this_mv.as_mv.col += 8; - thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - 
y_stride, y_stride, z, b->src_stride, &sse); diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) @@ -778,7 +806,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, this_mv.as_mv.col = (this_mv.as_mv.col - 8) | 4; this_mv.as_mv.row = startmv.as_mv.row + 4; - thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - 1, y_stride, z, b->src_stride, &sse); diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) @@ -790,7 +818,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, } this_mv.as_mv.col += 8; - thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y, y_stride, z, b->src_stride, &sse); diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index f5f95b13b..992df71c2 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1181,6 +1181,9 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, // Should we do a full search (best quality only) if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000) { + /* Check if mvp_full is within the range. */ + vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); + thissme = cpi->full_search_sad(x, c, e, &mvp_full, sadpb, 16, v_fn_ptr, x->mvcost, bsi->ref_mv);