From: Yunqing Wang
Date: Tue, 8 Jan 2013 18:44:19 +0000 (-0800)
Subject: vp9_sub_pixel_variance16x2 SSE2 optimization
X-Git-Tag: v1.3.0~1210^2~26^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=8d568312a2e6882a336eb3525fbe6b9e752163f3;p=libvpx

vp9_sub_pixel_variance16x2 SSE2 optimization

About 5% decoder speedup.

Change-Id: Ib6687d337af758a536a0e7e289f400990f1f9794
---

Reviewer notes (commentary only; not part of the commit message or of any hunk):

* vp9/common/vp9_findnearmv.c: the four calls inside the
  `if (xd->up_available)` branch change from vp9_sub_pixel_variance16x2_c to
  vp9_sub_pixel_variance16x2. The arguments of each call are unchanged.

* vp9/common/vp9_rtcd_defs.sh: adds a `prototype` / `specialize ... sse2`
  pair for vp9_sub_pixel_variance16x2, wrapped in a test that
  CONFIG_SUBPELREFMV equals "yes".

* vp9/common/x86/vp9_subpixel_variance_sse2.c (new file):
  vp9_sub_pixel_variance16x2_sse2 selects a helper from (xoffset, yoffset),
  with HALFNDX defined as 8:
      (8, 0)     calls vp9_half_horiz_variance16x_h_sse2
      (0, 8)     calls vp9_half_vert_variance16x_h_sse2
      (8, 8)     calls vp9_half_horiz_vert_variance16x_h_sse2
      otherwise  calls vp9_filter_block2d_bil_var_sse2 twice, once at
                 src_ptr/dst_ptr and once at src_ptr + 8/dst_ptr + 8, and
                 adds the second sum and sum of squares to the first.
  Every helper call passes 2 as Height. The function stores the accumulated
  sum of squares in *sse and returns
  xxsum0 - (((unsigned int)xsum0 * xsum0) >> 5).
  The shift by 5 presumably divides by the 16 * 2 = 32 pixels of the block
  (TODO confirm against the C reference implementation).

* The four helpers are only declared in the new file; their definitions are
  not part of this patch. NOTE(review): confirm they are linked into every
  build configuration that compiles this file.

* vp9/vp9_common.mk: adds the new file to VP9_COMMON_SRCS-$(HAVE_SSE2) inside
  an `ifeq ($(CONFIG_SUBPELREFMV),yes)` guard.

* NOTE(review): the line breaks and leading whitespace below were restored
  from a copy in which all newlines had been collapsed to spaces. The number
  of lines in each hunk matches its @@ header, but the indentation is a
  reconstruction; verify against the upstream commit before applying.

diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 85982fc18..54fd8d36f 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -185,33 +185,33 @@ void vp9_find_best_ref_mvs(MACROBLOCKD *xd,
       offset = ref_y_stride * row_offset + col_offset;
       score = 0;
       if (xd->up_available) {
-        vp9_sub_pixel_variance16x2_c(above_ref + offset, ref_y_stride,
-                                     SP(this_mv.as_mv.col),
-                                     SP(this_mv.as_mv.row),
-                                     above_src, xd->dst.y_stride, &sse);
+        vp9_sub_pixel_variance16x2(above_ref + offset, ref_y_stride,
+                                   SP(this_mv.as_mv.col),
+                                   SP(this_mv.as_mv.row),
+                                   above_src, xd->dst.y_stride, &sse);
         score += sse;
 #if CONFIG_SUPERBLOCKS
         if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
-          vp9_sub_pixel_variance16x2_c(above_ref + offset + 16,
-                                       ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       above_src + 16, xd->dst.y_stride, &sse);
+          vp9_sub_pixel_variance16x2(above_ref + offset + 16,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     above_src + 16, xd->dst.y_stride, &sse);
           score += sse;
         }
 #if CONFIG_SUPERBLOCKS64
         if (xd->mode_info_context->mbmi.sb_type >= BLOCK_SIZE_SB64X64) {
-          vp9_sub_pixel_variance16x2_c(above_ref + offset + 32,
-                                       ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       above_src + 32, xd->dst.y_stride, &sse);
+          vp9_sub_pixel_variance16x2(above_ref + offset + 32,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     above_src + 32, xd->dst.y_stride, &sse);
           score += sse;
-          vp9_sub_pixel_variance16x2_c(above_ref + offset + 48,
-                                       ref_y_stride,
-                                       SP(this_mv.as_mv.col),
-                                       SP(this_mv.as_mv.row),
-                                       above_src + 48, xd->dst.y_stride, &sse);
+          vp9_sub_pixel_variance16x2(above_ref + offset + 48,
+                                     ref_y_stride,
+                                     SP(this_mv.as_mv.col),
+                                     SP(this_mv.as_mv.row),
+                                     above_src + 48, xd->dst.y_stride, &sse);
           score += sse;
         }
 #endif
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index f02ee0260..e41c18ab0 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -252,6 +252,11 @@ specialize vp9_sad16x3 sse2
 prototype unsigned int vp9_sad3x16 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"
 specialize vp9_sad3x16 sse2
 
+if [ "$CONFIG_SUBPELREFMV" = "yes" ]; then
+prototype unsigned int vp9_sub_pixel_variance16x2 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int Refstride, unsigned int *sse"
+specialize vp9_sub_pixel_variance16x2 sse2
+fi
+
 #
 # Sub Pixel Filters
 #
diff --git a/vp9/common/x86/vp9_subpixel_variance_sse2.c b/vp9/common/x86/vp9_subpixel_variance_sse2.c
new file mode 100644
index 000000000..91cd75f22
--- /dev/null
+++ b/vp9/common/x86/vp9_subpixel_variance_sse2.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define HALFNDX 8
+
+void vp9_half_horiz_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                       int ref_pixels_per_line,
+                                       const unsigned char *src_ptr,
+                                       int src_pixels_per_line,
+                                       unsigned int Height,
+                                       int *sum,
+                                       unsigned int *sumsquared);
+
+void vp9_half_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                      int ref_pixels_per_line,
+                                      const unsigned char *src_ptr,
+                                      int src_pixels_per_line,
+                                      unsigned int Height,
+                                      int *sum,
+                                      unsigned int *sumsquared);
+
+void vp9_half_horiz_vert_variance16x_h_sse2(const unsigned char *ref_ptr,
+                                            int ref_pixels_per_line,
+                                            const unsigned char *src_ptr,
+                                            int src_pixels_per_line,
+                                            unsigned int Height,
+                                            int *sum,
+                                            unsigned int *sumsquared);
+
+void vp9_filter_block2d_bil_var_sse2(const unsigned char *ref_ptr,
+                                     int ref_pixels_per_line,
+                                     const unsigned char *src_ptr,
+                                     int src_pixels_per_line,
+                                     unsigned int Height,
+                                     int xoffset,
+                                     int yoffset,
+                                     int *sum,
+                                     unsigned int *sumsquared);
+
+unsigned int vp9_sub_pixel_variance16x2_sse2(const unsigned char *src_ptr,
+                                             int src_pixels_per_line,
+                                             int xoffset,
+                                             int yoffset,
+                                             const unsigned char *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse) {
+  int xsum0, xsum1;
+  unsigned int xxsum0, xxsum1;
+
+  if (xoffset == HALFNDX && yoffset == 0) {
+    vp9_half_horiz_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else if (xoffset == 0 && yoffset == HALFNDX) {
+    vp9_half_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
+    vp9_half_horiz_vert_variance16x_h_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      &xsum0, &xxsum0);
+  } else {
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr, src_pixels_per_line,
+      dst_ptr, dst_pixels_per_line, 2,
+      xoffset, yoffset,
+      &xsum0, &xxsum0);
+
+    vp9_filter_block2d_bil_var_sse2(
+      src_ptr + 8, src_pixels_per_line,
+      dst_ptr + 8, dst_pixels_per_line, 2,
+      xoffset, yoffset,
+      &xsum1, &xxsum1);
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+  }
+
+  *sse = xxsum0;
+  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 5));
+}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 4d17233e7..d84c65cf9 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -95,6 +95,9 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_wrapper_sse2.c
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
 VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm
+ifeq ($(CONFIG_SUBPELREFMV),yes)
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_variance_sse2.c
+endif
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm
 VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_ssse3.asm
 ifeq ($(CONFIG_POSTPROC),yes)