From: Yi Luo Date: Fri, 1 Apr 2016 22:50:17 +0000 (-0700) Subject: Optimized HBD 4x4 variance calculation X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=250935cab36276fac67bc38af414b851acc8e6ba;p=libvpx Optimized HBD 4x4 variance calculation vpx_highbd_8/10/12_variance4x4_sse4_1 improves performance ~7%-11%. Change-Id: Ida22bb2a2f7a58037cfd73e186d4f6267a960c02 --- diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c index 4327d974c..0c8ec43eb 100644 --- a/vp10/encoder/mcomp.c +++ b/vp10/encoder/mcomp.c @@ -367,8 +367,8 @@ static unsigned int setup_center_error(const MACROBLOCKD *xd, if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); - vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); + vpx_highbd_comp_avg_pred_c(comp_pred16, second_pred, w, h, y + offset, + y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index 24f42df34..90c8bed52 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -433,7 +433,7 @@ uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ return *sse; \ } -static void highbd_var_filter_block2d_bil_first_pass( +void highbd_var_filter_block2d_bil_first_pass( const uint8_t *src_ptr8, uint16_t *output_ptr, unsigned int src_pixels_per_line, @@ -459,7 +459,7 @@ static void highbd_var_filter_block2d_bil_first_pass( } } -static void highbd_var_filter_block2d_bil_second_pass( +void highbd_var_filter_block2d_bil_second_pass( const uint16_t *src_ptr, uint16_t *output_ptr, unsigned int src_pixels_per_line, @@ -551,8 +551,8 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + 
vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ dst_stride, sse); \ @@ -573,8 +573,8 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ @@ -595,8 +595,8 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ @@ -635,9 +635,9 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride) { +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 161d6474d..4ad23f8ae 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -130,6 +130,24 @@ typedef struct vp10_variance_vtable { } vp10_variance_fn_ptr_t; #endif // CONFIG_VP10 +void highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int 
output_height, + unsigned int output_width, + const uint8_t *filter); + +void highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index e37184965..a9805d704 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -350,6 +350,7 @@ endif # CONFIG_USE_X86INC ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm ifeq ($(CONFIG_USE_X86INC),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d01e81d0c..10a5280b9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1316,10 +1316,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse2"; } + if ($w == 4 && $h == 4) { + specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse4_1"; + } if ($w != 128 && $h != 128 && $w != 4) { specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", $sse2_x86inc; specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", $sse2_x86inc; } + if ($w == 4 && $h == 4) { + specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; + specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; + } } } } # CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c new file mode 100644 index 000000000..18ecc7efd --- /dev/null +++ b/vpx_dsp/x86/highbd_variance_sse4.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> /* SSE4.1 */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/variance.h" +#include "vpx_dsp/vpx_filter.h" + +static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + uint64_t *sse, int64_t *sum) { + __m128i u0, u1, u2, u3; + __m128i s0, s1, s2, s3; + __m128i t0, t1, x0, y0; + __m128i a0, a1, a2, a3; + __m128i b0, b1, b2, b3; + __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + a0 = _mm_loadu_si128((__m128i const *) (a + 0 * a_stride)); + a1 = _mm_loadu_si128((__m128i const *) (a + 1 * a_stride)); + a2 = _mm_loadu_si128((__m128i const *) (a + 2 * a_stride)); + a3 = _mm_loadu_si128((__m128i const *) (a + 3 * a_stride)); + + b0 = _mm_loadu_si128((__m128i const *) (b + 0 * b_stride)); + b1 = _mm_loadu_si128((__m128i const *) (b + 1 * b_stride)); + b2 = _mm_loadu_si128((__m128i const *) (b + 2 * b_stride)); + b3 = _mm_loadu_si128((__m128i const *) (b + 3 * b_stride)); + + u0 = _mm_unpacklo_epi16(a0, a1); + u1 = _mm_unpacklo_epi16(a2, a3); + u2 = _mm_unpacklo_epi16(b0, b1); + u3 = _mm_unpacklo_epi16(b2, b3); + + s0 = _mm_sub_epi16(u0, u2); + s1 = _mm_sub_epi16(u1, u3); + + t0 = _mm_madd_epi16(s0, k_one_epi16); + t1 = _mm_madd_epi16(s1, k_one_epi16); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + y0 = _mm_hadd_epi32(s3, s3); + + t0 = _mm_madd_epi16(s0, s0); + t1 = _mm_madd_epi16(s1, s1); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + x0 = _mm_hadd_epi32(s3, s3); + + *sse = (uint64_t)_mm_extract_epi32(x0, 0); + *sum = 
(int64_t)_mm_extract_epi32(y0, 0); +} + +uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a, + int a_stride, + const uint8_t *b, + int b_stride, + uint32_t *sse) { + int64_t sum; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)local_sse; + + return *sse - ((sum * sum) >> 4); +} + +uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a, + int a_stride, + const uint8_t *b, + int b_stride, + uint32_t *sse) { + int64_t sum; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); + sum = ROUND_POWER_OF_TWO(sum, 2); + + return *sse - ((sum * sum) >> 4); +} + +uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a, + int a_stride, + const uint8_t *b, + int b_stride, + uint32_t *sse) { + int64_t sum; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); + sum = ROUND_POWER_OF_TWO(sum, 4); + + return *sse - ((sum * sum) >> 4); +} + +// Sub-pixel +uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), + 4, dst, dst_stride, sse); +} + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, 
bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), + 4, dst, dst_stride, sse); +} + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), + 4, dst, dst_stride, sse); +} + +// Sub-pixel average + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse, + const uint8_t *second_pred) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), + 4, dst, dst_stride, sse); +} + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse, + const uint8_t *second_pred) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + 
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), + 4, dst, dst_stride, sse); +} + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse, + const uint8_t *second_pred) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), + 4, dst, dst_stride, sse); +}