From 71ecb5d7d905d1f1771b6c5e130e873dcf458b73 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Wed, 27 Oct 2010 08:45:24 -0400 Subject: [PATCH] Full search SAD function optimization in SSE4.1 Use mpsadbw, and calculate 8 sad at once. Function list: vp8_sad16x16x8_sse4 vp8_sad16x8x8_sse4 vp8_sad8x16x8_sse4 vp8_sad8x8x8_sse4 vp8_sad4x4x8_sse4 (test clip: tulip) For best quality mode, this gave encoder a 5% performance boost. For good quality mode with speed=1, this gave encoder a 3% performance boost. Change-Id: I083b5a39d39144f88dcbccbef95da6498e490134 --- build/make/configure.sh | 1 + configure | 1 + vp8/encoder/generic/csystemdependent.c | 6 + vp8/encoder/mcomp.c | 154 ++++++++++- vp8/encoder/mcomp.h | 1 + vp8/encoder/onyx_if.c | 5 + vp8/encoder/sad_c.c | 90 +++++++ vp8/encoder/variance.h | 43 +++ vp8/encoder/x86/mcomp_x86.h | 9 + vp8/encoder/x86/sad_sse4.asm | 353 +++++++++++++++++++++++++ vp8/encoder/x86/variance_x86.h | 27 ++ vp8/encoder/x86/x86_csystemdependent.c | 23 +- vp8/vp8cx.mk | 1 + vpx_ports/x86.h | 3 + 14 files changed, 709 insertions(+), 8 deletions(-) create mode 100644 vp8/encoder/x86/sad_sse4.asm diff --git a/build/make/configure.sh b/build/make/configure.sh index cdd55ebd8..d25d6400e 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -824,6 +824,7 @@ process_common_toolchain() { soft_enable sse2 soft_enable sse3 soft_enable ssse3 + soft_enable sse4_1 case ${tgt_os} in win*) diff --git a/configure b/configure index 39ef83ffa..11e086e9c 100755 --- a/configure +++ b/configure @@ -199,6 +199,7 @@ ARCH_EXT_LIST=" sse2 sse3 ssse3 + sse4_1 altivec " diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index ae22b274e..824af5e46 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -40,6 +40,12 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_c; cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_c; + cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_c; + cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_c; + cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_c; + cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_c; + cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_c; + cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_c; cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_c; cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_c; diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 8cc63f83d..bb85afa6f 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1323,7 +1323,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er check_here = r * mv_stride + in_what + col_min; c = col_min; - while ((c + 3) < col_max) + while ((c + 2) < col_max) { int i; @@ -1388,6 +1388,158 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er #endif +int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2]) +{ + unsigned char *what = (*(b->base_src) + b->src); + int what_stride = b->src_stride; + unsigned char *in_what; + int in_what_stride = d->pre_stride; + int mv_stride = d->pre_stride; + unsigned char *bestaddress; + MV *best_mv = &d->bmi.mv.as_mv; + MV this_mv; + int bestsad = INT_MAX; + int r, c; + + unsigned char *check_here; + unsigned int thissad; + + int ref_row = ref_mv->row >> 3; + int ref_col = ref_mv->col >> 3; + + int row_min = ref_row - distance; + int row_max = ref_row + distance; + int col_min = ref_col - distance; + int col_max = ref_col + distance; + + unsigned short sad_array8[8]; + unsigned int sad_array[3]; + + // Work out the mid point for the search + in_what = *(d->base_pre) + d->pre; + bestaddress = in_what + (ref_row * d->pre_stride) + ref_col; + + best_mv->row = ref_row; + best_mv->col = ref_col; + + // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits + if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) && + (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max)) + { + // Baseline value at the centre + bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit); + } + + // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border + if (col_min < x->mv_col_min) + col_min = x->mv_col_min; + + if (col_max > x->mv_col_max) + col_max = x->mv_col_max; + + if (row_min < x->mv_row_min) + row_min = x->mv_row_min; + + if (row_max > x->mv_row_max) + row_max = x->mv_row_max; + + for (r = row_min; r < row_max ; r++) + { + this_mv.row = r << 3; + check_here = r * mv_stride + in_what + col_min; + c = col_min; + + while ((c + 7) < col_max) + { + int i; + + fn_ptr->sdx8f(what, what_stride, check_here , in_what_stride, sad_array8); + + for (i = 0; i < 8; i++) + { + thissad = (unsigned int)sad_array8[i]; + + if (thissad < bestsad) + { + this_mv.col = c << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + } + + check_here++; + c++; + } + } + + while ((c + 2) < col_max) + { + int i; + + fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array); + + for (i = 0; i < 3; i++) + { + thissad = sad_array[i]; + + if (thissad < bestsad) + { + this_mv.col = c << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + } + + check_here++; + c++; + } + } + + while (c < col_max) + { + thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad); + + if (thissad < bestsad) + { + this_mv.col = c << 3; + thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); + + if (thissad < bestsad) + { + bestsad = thissad; + best_mv->row = r; + best_mv->col = c; + bestaddress = check_here; + } + } + + check_here ++; + c ++; + } + } + + this_mv.row = best_mv->row << 3; + this_mv.col = best_mv->col << 3; + + if (bestsad < INT_MAX) + return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad)) + + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + else + return INT_MAX; +} + #ifdef ENTROPY_STATS void print_mode_context(void) { diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h index 181e95822..7d6036248 100644 --- a/vp8/encoder/mcomp.h +++ b/vp8/encoder/mcomp.h @@ -93,6 +93,7 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step; typedef prototype_full_search_sad(*vp8_full_search_fn_t); extern prototype_full_search_sad(vp8_full_search_sad); extern prototype_full_search_sad(vp8_full_search_sadx3); +extern prototype_full_search_sad(vp8_full_search_sadx8); typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t); extern prototype_diamond_search_sad(vp8_diamond_search_sad); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 7a78b2901..5f02a5a02 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2341,6 +2341,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v); cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv); cpi->fn_ptr[BLOCK_16X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3); + cpi->fn_ptr[BLOCK_16X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x8); cpi->fn_ptr[BLOCK_16X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d); cpi->fn_ptr[BLOCK_16X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8); @@ -2350,6 +2351,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL; cpi->fn_ptr[BLOCK_16X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3); + cpi->fn_ptr[BLOCK_16X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x8); cpi->fn_ptr[BLOCK_16X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d); cpi->fn_ptr[BLOCK_8X16].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16); @@ -2359,6 +2361,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL; cpi->fn_ptr[BLOCK_8X16].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3); + cpi->fn_ptr[BLOCK_8X16].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x8); cpi->fn_ptr[BLOCK_8X16].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d); cpi->fn_ptr[BLOCK_8X8].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8); @@ -2368,6 +2371,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL; cpi->fn_ptr[BLOCK_8X8].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3); + cpi->fn_ptr[BLOCK_8X8].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x8); cpi->fn_ptr[BLOCK_8X8].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d); cpi->fn_ptr[BLOCK_4X4].sdf = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4); @@ -2377,6 +2381,7 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v = NULL; cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL; cpi->fn_ptr[BLOCK_4X4].sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3); + cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8); cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d); #if !(CONFIG_REALTIME_ONLY) diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c index 2ff122f68..5eaca5935 100644 --- a/vp8/encoder/sad_c.c +++ b/vp8/encoder/sad_c.c @@ -126,6 +126,24 @@ void vp8_sad16x16x3_c( sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } +void vp8_sad16x16x8_c( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned short *sad_array +) +{ + sad_array[0] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[3] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); + sad_array[4] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); + sad_array[5] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); + sad_array[6] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); + sad_array[7] = (unsigned short)vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); +} + void vp8_sad16x8x3_c( const unsigned char *src_ptr, int src_stride, @@ -139,6 +157,24 @@ void vp8_sad16x8x3_c( sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } +void vp8_sad16x8x8_c( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned short *sad_array +) +{ + sad_array[0] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[3] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); + sad_array[4] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); + sad_array[5] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); + sad_array[6] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); + sad_array[7] = (unsigned short)vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); +} + void vp8_sad8x8x3_c( const unsigned char *src_ptr, int src_stride, @@ -152,6 +188,24 @@ void vp8_sad8x8x3_c( sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } +void vp8_sad8x8x8_c( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned short *sad_array +) +{ + sad_array[0] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[3] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); + sad_array[4] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); + sad_array[5] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); + sad_array[6] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); + sad_array[7] = (unsigned short)vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); +} + void vp8_sad8x16x3_c( const unsigned char *src_ptr, int src_stride, @@ -165,6 +219,24 @@ void vp8_sad8x16x3_c( sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } +void vp8_sad8x16x8_c( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned short *sad_array +) +{ + sad_array[0] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[3] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); + sad_array[4] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); + sad_array[5] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); + sad_array[6] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); + sad_array[7] = (unsigned short)vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); +} + void vp8_sad4x4x3_c( const unsigned char *src_ptr, int src_stride, @@ -178,6 +250,24 @@ void vp8_sad4x4x3_c( sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); } +void vp8_sad4x4x8_c( + const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, + unsigned short *sad_array +) +{ + sad_array[0] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr , ref_stride, 0x7fffffff); + sad_array[1] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff); + sad_array[2] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff); + sad_array[3] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 3 , ref_stride, 0x7fffffff); + sad_array[4] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 4, ref_stride, 0x7fffffff); + sad_array[5] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 5, ref_stride, 0x7fffffff); + sad_array[6] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 6 , ref_stride, 0x7fffffff); + sad_array[7] = (unsigned short)vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 7, ref_stride, 0x7fffffff); +} + void vp8_sad16x16x4d_c( const unsigned char *src_ptr, int src_stride, diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index 5b70fe54b..5befd3b86 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -32,6 +32,16 @@ unsigned int *sad_array\ ) +#define prototype_sad_multi_same_address_1(sym)\ + void (sym)\ + (\ + const unsigned char *src_ptr, \ + int source_stride, \ + const unsigned char *ref_ptr, \ + int ref_stride, \ + unsigned short *sad_array\ + ) + #define prototype_sad_multi_dif_address(sym)\ void (sym)\ (\ @@ -138,6 +148,31 @@ extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3); #endif extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3); +#ifndef vp8_variance_sad16x16x8 +#define vp8_variance_sad16x16x8 vp8_sad16x16x8_c +#endif +extern prototype_sad_multi_same_address_1(vp8_variance_sad16x16x8); + +#ifndef vp8_variance_sad16x8x8 +#define vp8_variance_sad16x8x8 vp8_sad16x8x8_c +#endif +extern prototype_sad_multi_same_address_1(vp8_variance_sad16x8x8); + +#ifndef vp8_variance_sad8x8x8 +#define vp8_variance_sad8x8x8 vp8_sad8x8x8_c +#endif +extern prototype_sad_multi_same_address_1(vp8_variance_sad8x8x8); + +#ifndef vp8_variance_sad8x16x8 +#define vp8_variance_sad8x16x8 vp8_sad8x16x8_c +#endif +extern prototype_sad_multi_same_address_1(vp8_variance_sad8x16x8); + +#ifndef vp8_variance_sad4x4x8 +#define vp8_variance_sad4x4x8 vp8_sad4x4x8_c +#endif +extern prototype_sad_multi_same_address_1(vp8_variance_sad4x4x8); + //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- #ifndef vp8_variance_sad16x16x4d @@ -274,6 +309,7 @@ extern prototype_sad(vp8_variance_get4x4sse_cs); typedef prototype_sad(*vp8_sad_fn_t); typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t); +typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t); typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t); typedef prototype_variance(*vp8_variance_fn_t); typedef prototype_variance2(*vp8_variance2_fn_t); @@ -317,6 +353,12 @@ typedef struct vp8_sad_multi_fn_t sad8x8x3; vp8_sad_multi_fn_t sad4x4x3; + vp8_sad_multi1_fn_t sad16x16x8; + vp8_sad_multi1_fn_t sad16x8x8; + vp8_sad_multi1_fn_t sad8x16x8; + vp8_sad_multi1_fn_t sad8x8x8; + vp8_sad_multi1_fn_t sad4x4x8; + vp8_sad_multi_d_fn_t sad16x16x4d; vp8_sad_multi_d_fn_t sad16x8x4d; vp8_sad_multi_d_fn_t sad8x16x4d; @@ -334,6 +376,7 @@ typedef struct vp8_variance_fn_t svf_halfpix_v; vp8_variance_fn_t svf_halfpix_hv; vp8_sad_multi_fn_t sdx3f; + vp8_sad_multi1_fn_t sdx8f; vp8_sad_multi_d_fn_t sdx4df; } vp8_variance_fn_ptr_t; diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h index e8d658b39..3b7b29c21 100644 --- a/vp8/encoder/x86/mcomp_x86.h +++ b/vp8/encoder/x86/mcomp_x86.h @@ -24,5 +24,14 @@ #endif #endif +#if HAVE_SSE4_1 +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_search_full_search +#define vp8_search_full_search vp8_full_search_sadx8 + +#endif +#endif + #endif diff --git a/vp8/encoder/x86/sad_sse4.asm b/vp8/encoder/x86/sad_sse4.asm new file mode 100644 index 000000000..03ecec4b3 --- /dev/null +++ b/vp8/encoder/x86/sad_sse4.asm @@ -0,0 +1,353 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%macro PROCESS_16X2X8 1 +%if %1 + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm1, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm1, xmm2 + paddw xmm1, xmm3 + paddw xmm1, xmm4 +%else + movdqa xmm0, XMMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + movq xmm2, MMWORD PTR [rdi+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endif + movdqa xmm0, XMMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + movq xmm2, MMWORD PTR [rdi+ rdx+16] + punpcklqdq xmm5, xmm3 + punpcklqdq xmm3, xmm2 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + + psrldq xmm0, 8 + movdqa xmm4, xmm3 + mpsadbw xmm3, xmm0, 0x0 + mpsadbw xmm4, xmm0, 0x5 + + paddw xmm5, xmm2 + paddw xmm5, xmm3 + paddw xmm5, xmm4 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_8X2X8 1 +%if %1 + movq xmm0, MMWORD PTR [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + movdqa xmm2, xmm1 + mpsadbw xmm1, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm1, xmm2 +%else + movq xmm0, MMWORD PTR [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endif + movq xmm0, MMWORD PTR [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + movdqa xmm2, xmm5 + mpsadbw xmm5, xmm0, 0x0 + mpsadbw xmm2, xmm0, 0x5 + paddw xmm5, xmm2 + + paddw xmm1, xmm5 +%endmacro + +%macro PROCESS_4X2X8 1 +%if %1 + movd xmm0, [rsi] + movq xmm1, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm1, xmm3 + + mpsadbw xmm1, xmm0, 0x0 +%else + movd xmm0, [rsi] + movq xmm5, MMWORD PTR [rdi] + movq xmm3, MMWORD PTR [rdi+8] + punpcklqdq xmm5, xmm3 + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endif + movd xmm0, [rsi + rax] + movq xmm5, MMWORD PTR [rdi+ rdx] + movq xmm3, MMWORD PTR [rdi+ rdx+8] + punpcklqdq xmm5, xmm3 + + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rdx*2] + + mpsadbw xmm5, xmm0, 0x0 + + paddw xmm1, xmm5 +%endmacro + + +;void vp8_sad16x16x8_sse4( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array); +global sym(vp8_sad16x16x8_sse4) +sym(vp8_sad16x16x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + mov rdi, arg(4) ;Results + movdqa XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_sad16x8x8_sse4( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vp8_sad16x8x8_sse4) +sym(vp8_sad16x8x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_16X2X8 1 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + PROCESS_16X2X8 0 + + mov rdi, arg(4) ;Results + movdqa XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_sad8x8x8_sse4( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vp8_sad8x8x8_sse4) +sym(vp8_sad8x8x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + + mov rdi, arg(4) ;Results + movdqa XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_sad8x16x8_sse4( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vp8_sad8x16x8_sse4) +sym(vp8_sad8x16x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_8X2X8 1 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + PROCESS_8X2X8 0 + mov rdi, arg(4) ;Results + movdqa XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + +;void vp8_sad4x4x8_c( +; const unsigned char *src_ptr, +; int src_stride, +; const unsigned char *ref_ptr, +; int ref_stride, +; unsigned short *sad_array +;); +global sym(vp8_sad4x4x8_sse4) +sym(vp8_sad4x4x8_sse4): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(2) ;ref_ptr + + movsxd rax, dword ptr arg(1) ;src_stride + movsxd rdx, dword ptr arg(3) ;ref_stride + + PROCESS_4X2X8 1 + PROCESS_4X2X8 0 + + mov rdi, arg(4) ;Results + movdqa XMMWORD PTR [rdi], xmm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + + + + diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h index 2b62b5f7e..6bea15ebc 100644 --- a/vp8/encoder/x86/variance_x86.h +++ b/vp8/encoder/x86/variance_x86.h @@ -297,4 +297,31 @@ extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3); #endif #endif + +#if HAVE_SSE4_1 +extern prototype_sad_multi_same_address_1(vp8_sad16x16x8_sse4); +extern prototype_sad_multi_same_address_1(vp8_sad16x8x8_sse4); +extern prototype_sad_multi_same_address_1(vp8_sad8x16x8_sse4); +extern prototype_sad_multi_same_address_1(vp8_sad8x8x8_sse4); +extern prototype_sad_multi_same_address_1(vp8_sad4x4x8_sse4); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_variance_sad16x16x8 +#define vp8_variance_sad16x16x8 vp8_sad16x16x8_sse4 + +#undef vp8_variance_sad16x8x8 +#define vp8_variance_sad16x8x8 vp8_sad16x8x8_sse4 + +#undef vp8_variance_sad8x16x8 +#define vp8_variance_sad8x16x8 vp8_sad8x16x8_sse4 + +#undef vp8_variance_sad8x8x8 +#define vp8_variance_sad8x8x8 vp8_sad8x8x8_sse4 + +#undef vp8_variance_sad4x4x8 +#define vp8_variance_sad4x4x8 vp8_sad4x4x8_sse4 + +#endif +#endif + #endif diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 2581c337f..ed0e71ed0 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -188,6 +188,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) int wmt_enabled = flags & HAS_SSE2; int SSE3Enabled = flags & HAS_SSE3; int SSSE3Enabled = flags & HAS_SSSE3; + int SSE4_1Enabled = flags & HAS_SSE4_1; /* Note: * @@ -198,7 +199,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) /* Override default functions with fastest ones for this CPU. */ #if HAVE_MMX - if (mmx_enabled) { cpi->rtcd.variance.sad16x16 = vp8_sad16x16_mmx; @@ -254,10 +254,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) /*cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_mmx;*/ } - #endif -#if HAVE_SSE2 +#if HAVE_SSE2 if (wmt_enabled) { cpi->rtcd.variance.sad16x16 = vp8_sad16x16_wmt; @@ -307,10 +306,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) /*cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse2;*/ cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_sse2; } - #endif -#if HAVE_SSE3 +#if HAVE_SSE3 if (SSE3Enabled) { cpi->rtcd.variance.sad16x16 = vp8_sad16x16_sse3; @@ -328,16 +326,27 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad4x4x4d = vp8_sad4x4x4d_sse3; cpi->rtcd.search.diamond_search = vp8_diamond_search_sadx4; } - #endif -#if HAVE_SSSE3 +#if HAVE_SSSE3 if (SSSE3Enabled) { cpi->rtcd.variance.sad16x16x3 = vp8_sad16x16x3_ssse3; cpi->rtcd.variance.sad16x8x3 = vp8_sad16x8x3_ssse3; } +#endif +#if HAVE_SSE4_1 + if (SSE4_1Enabled) + { + cpi->rtcd.variance.sad16x16x8 = vp8_sad16x16x8_sse4; + cpi->rtcd.variance.sad16x8x8 = vp8_sad16x8x8_sse4; + cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4; + cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4; + cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4; + cpi->rtcd.search.full_search = vp8_full_search_sadx8; + } #endif + #endif } diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 98288f2b6..f693f865b 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -109,6 +109,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm +VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index feffc3e69..190c8643a 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -74,6 +74,7 @@ void __cpuid(int CPUInfo[4], int info_type); #define HAS_SSE2 0x04 #define HAS_SSE3 0x08 #define HAS_SSSE3 0x10 +#define HAS_SSE4_1 0x20 #ifndef BIT #define BIT(n) (1<