From: Yi Luo Date: Fri, 1 Apr 2016 22:50:17 +0000 (-0700) Subject: Optimized HBD 4x4 variance calculation X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=250935cab36276fac67bc38af414b851acc8e6ba;p=libvpx Optimized HBD 4x4 variance calculation vpx_highbd_8/10/12_variance4x4_sse4_1 improves performance ~7%-11%. Change-Id: Ida22bb2a2f7a58037cfd73e186d4f6267a960c02 --- diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c index 4327d974c..0c8ec43eb 100644 --- a/vp10/encoder/mcomp.c +++ b/vp10/encoder/mcomp.c @@ -367,8 +367,8 @@ static unsigned int setup_center_error(const MACROBLOCKD *xd, if (second_pred != NULL) { if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]); - vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset, - y_stride); + vpx_highbd_comp_avg_pred_c(comp_pred16, second_pred, w, h, y + offset, + y_stride); besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index 24f42df34..90c8bed52 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -433,7 +433,7 @@ uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \ return *sse; \ } -static void highbd_var_filter_block2d_bil_first_pass( +void highbd_var_filter_block2d_bil_first_pass( const uint8_t *src_ptr8, uint16_t *output_ptr, unsigned int src_pixels_per_line, @@ -459,7 +459,7 @@ static void highbd_var_filter_block2d_bil_first_pass( } } -static void highbd_var_filter_block2d_bil_second_pass( +void highbd_var_filter_block2d_bil_second_pass( const uint16_t *src_ptr, uint16_t *output_ptr, unsigned int src_pixels_per_line, @@ -551,8 +551,8 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + 
vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \ dst_stride, sse); \ @@ -573,8 +573,8 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ @@ -595,8 +595,8 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters_2t[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \ W, dst, dst_stride, sse); \ @@ -635,9 +635,9 @@ HIGHBD_MSE(16, 8) HIGHBD_MSE(8, 16) HIGHBD_MSE(8, 8) -void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8, - int width, int height, const uint8_t *ref8, - int ref_stride) { +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, + int width, int height, const uint8_t *ref8, + int ref_stride) { int i, j; uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 161d6474d..4ad23f8ae 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -130,6 +130,24 @@ typedef struct vp10_variance_vtable { } vp10_variance_fn_ptr_t; #endif // CONFIG_VP10 +void highbd_var_filter_block2d_bil_first_pass( + const uint8_t *src_ptr8, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int 
output_height, + unsigned int output_width, + const uint8_t *filter); + +void highbd_var_filter_block2d_bil_second_pass( + const uint16_t *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index e37184965..a9805d704 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -350,6 +350,7 @@ endif # CONFIG_USE_X86INC ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm ifeq ($(CONFIG_USE_X86INC),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index d01e81d0c..10a5280b9 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1316,10 +1316,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if ($w != 128 && $h != 128 && $w != 4 && $h != 4) { specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse2"; } + if ($w == 4 && $h == 4) { + specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse4_1"; + } if ($w != 128 && $h != 128 && $w != 4) { specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", $sse2_x86inc; specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", $sse2_x86inc; } + if ($w == 4 && $h == 4) { + specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1"; + specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1"; + } } } } # CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c new file mode 100644 index 000000000..18ecc7efd --- /dev/null +++ b/vpx_dsp/x86/highbd_variance_sse4.c @@ -0,0 +1,236 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> /* SSE4.1 */ + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_dsp/variance.h" +#include "vpx_dsp/vpx_filter.h" + +static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, + uint64_t *sse, int64_t *sum) { + __m128i u0, u1, u2, u3; + __m128i s0, s1, s2, s3; + __m128i t0, t1, x0, y0; + __m128i a0, a1, a2, a3; + __m128i b0, b1, b2, b3; + __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); + + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + + a0 = _mm_loadu_si128((__m128i const *) (a + 0 * a_stride)); + a1 = _mm_loadu_si128((__m128i const *) (a + 1 * a_stride)); + a2 = _mm_loadu_si128((__m128i const *) (a + 2 * a_stride)); + a3 = _mm_loadu_si128((__m128i const *) (a + 3 * a_stride)); + + b0 = _mm_loadu_si128((__m128i const *) (b + 0 * b_stride)); + b1 = _mm_loadu_si128((__m128i const *) (b + 1 * b_stride)); + b2 = _mm_loadu_si128((__m128i const *) (b + 2 * b_stride)); + b3 = _mm_loadu_si128((__m128i const *) (b + 3 * b_stride)); + + u0 = _mm_unpacklo_epi16(a0, a1); + u1 = _mm_unpacklo_epi16(a2, a3); + u2 = _mm_unpacklo_epi16(b0, b1); + u3 = _mm_unpacklo_epi16(b2, b3); + + s0 = _mm_sub_epi16(u0, u2); + s1 = _mm_sub_epi16(u1, u3); + + t0 = _mm_madd_epi16(s0, k_one_epi16); + t1 = _mm_madd_epi16(s1, k_one_epi16); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + y0 = _mm_hadd_epi32(s3, s3); + + t0 = _mm_madd_epi16(s0, s0); + t1 = _mm_madd_epi16(s1, s1); + + s2 = _mm_hadd_epi32(t0, t1); + s3 = _mm_hadd_epi32(s2, s2); + x0 = _mm_hadd_epi32(s3, s3); + + *sse = (uint64_t)_mm_extract_epi32(x0, 0); + *sum = 
(int64_t)_mm_extract_epi32(y0, 0); +} + +uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a, + int a_stride, + const uint8_t *b, + int b_stride, + uint32_t *sse) { + int64_t sum; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)local_sse; + + return *sse - ((sum * sum) >> 4); +} + +uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a, + int a_stride, + const uint8_t *b, + int b_stride, + uint32_t *sse) { + int64_t sum; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); + sum = ROUND_POWER_OF_TWO(sum, 2); + + return *sse - ((sum * sum) >> 4); +} + +uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a, + int a_stride, + const uint8_t *b, + int b_stride, + uint32_t *sse) { + int64_t sum; + uint64_t local_sse; + + variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); + *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); + sum = ROUND_POWER_OF_TWO(sum, 4); + + return *sse - ((sum * sum) >> 4); +} + +// Sub-pixel +uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), + 4, dst, dst_stride, sse); +} + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, 
bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), + 4, dst, dst_stride, sse); +} + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), + 4, dst, dst_stride, sse); +} + +// Sub-pixel average + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse, + const uint8_t *second_pred) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), + 4, dst, dst_stride, sse); +} + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse, + const uint8_t *second_pred) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + 
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), + 4, dst, dst_stride, sse); +} + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1( + const uint8_t *src, int src_stride, + int xoffset, int yoffset, + const uint8_t *dst, int dst_stride, + uint32_t *sse, + const uint8_t *second_pred) { + + uint16_t fdata3[(4 + 1) * 4]; + uint16_t temp2[4 * 4]; + DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); + + highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1, + 4, bilinear_filters_2t[xoffset]); + highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, + bilinear_filters_2t[yoffset]); + + vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4, + CONVERT_TO_BYTEPTR(temp2), 4); + + return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), + 4, dst, dst_stride, sse); +}