granicus.if.org Git - libvpx/commitdiff
Change register loading to fix stack overflow issue
authorYi Luo <luoyi@google.com>
Thu, 23 Jun 2016 21:31:26 +0000 (14:31 -0700)
committerYi Luo <luoyi@google.com>
Fri, 24 Jun 2016 17:39:49 +0000 (10:39 -0700)
- Use _mm_loadl_epi64 instead of _mm_loadu_si128 for
  uint16_t temp2[4 * 4] buffer.
- Refer to:
  d0de89a remove vpx_highbd_1[02]_sub_pixel_variance4x4_sse4_1
BUG=webm:1242

Change-Id: Ieff555c8dd8070937f27f4ec8535b77e1ed5b8b2

test/variance_test.cc
vpx_dsp/vpx_dsp_rtcd_defs.pl
vpx_dsp/x86/highbd_variance_sse4.c

index f874f1dd8da89ca7e3eae3047b851a3e0c57d0e7..8ac85118ca8e6b9c4bb7b6c3b026cbf891a746f7 100644 (file)
@@ -1151,7 +1151,9 @@ INSTANTIATE_TEST_CASE_P(
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, VpxSubpelVarianceTest,
     ::testing::Values(
-         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_sse4_1, 8)));
+         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_sse4_1, 8),
+         make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_sse4_1, 10),
+         make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_sse4_1, 12)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, VpxSubpelAvgVarianceTest,
index 61b0f2d18b84fa0fdabb973f78eae7a65bd752bb..2fb61f103d4a5c7a71e920ceb46fbf528cf877a8 100644 (file)
@@ -1359,8 +1359,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
       }
     }
   }
-  $vpx_highbd_10_sub_pixel_variance4x4_sse4_1='';
-  $vpx_highbd_12_sub_pixel_variance4x4_sse4_1='';
 }  # CONFIG_VP9_HIGHBITDEPTH
 
 if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
index 82b4deb4671168c3deeb249e4ff61bf0172688ec..4d0b75deab782f90b3efc9d27b7e0e88568a6aa1 100644 (file)
@@ -29,15 +29,15 @@ static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
   uint16_t *a = CONVERT_TO_SHORTPTR(a8);
   uint16_t *b = CONVERT_TO_SHORTPTR(b8);
 
-  a0 = _mm_loadu_si128((__m128i const *) (a + 0 * a_stride));
-  a1 = _mm_loadu_si128((__m128i const *) (a + 1 * a_stride));
-  a2 = _mm_loadu_si128((__m128i const *) (a + 2 * a_stride));
-  a3 = _mm_loadu_si128((__m128i const *) (a + 3 * a_stride));
+  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
 
-  b0 = _mm_loadu_si128((__m128i const *) (b + 0 * b_stride));
-  b1 = _mm_loadu_si128((__m128i const *) (b + 1 * b_stride));
-  b2 = _mm_loadu_si128((__m128i const *) (b + 2 * b_stride));
-  b3 = _mm_loadu_si128((__m128i const *) (b + 3 * b_stride));
+  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
 
   u0 = _mm_unpacklo_epi16(a0, a1);
   u1 = _mm_unpacklo_epi16(a2, a3);
@@ -130,6 +130,44 @@ uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1(
                                   4, dst, dst_stride, sse);
 }
 
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                   4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                   4, dst, dst_stride, sse);
+}
+
 // Sub-pixel average
 
 uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1(