return *sse; \
}
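+// These two filter passes are no longer static: the new SSE4.1 sub-pixel
+// variance code reuses them for the bilinear pre-filtering.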
-static void highbd_var_filter_block2d_bil_first_pass(
+void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
}
}
-static void highbd_var_filter_block2d_bil_second_pass(
+void highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr,
uint16_t *output_ptr,
unsigned int src_pixels_per_line,
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
dst_stride, sse); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters_2t[yoffset]); \
\
- vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
- CONVERT_TO_BYTEPTR(temp2), W); \
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
\
return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
W, dst, dst_stride, sse); \
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
- int width, int height, const uint8_t *ref8,
- int ref_stride) {
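+// The _c suffix follows the RTCD convention, so vpx_highbd_comp_avg_pred can
+// later be pointed at a SIMD implementation without renaming callers.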
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
int i, j;
uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
--- /dev/null
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/variance.h"
+#include "vpx_dsp/vpx_filter.h"
+
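+// Returns the (unscaled) sum and sum of squared differences of one 4x4
+// high-bitdepth block through the two output parameters.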
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ uint64_t *sse, int64_t *sum) {
+ __m128i u0, u1, u2, u3;
+ __m128i s0, s1, s2, s3;
+ __m128i t0, t1, x0, y0;
+ __m128i a0, a1, a2, a3;
+ __m128i b0, b1, b2, b3;
+ __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
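+  // Each row is 4 pixels (8 bytes); loadl_epi64 reads exactly one row and
+  // cannot run past the end of 4-wide buffers such as temp2 below.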
+  a0 = _mm_loadl_epi64((const __m128i *)(a + 0 * a_stride));
+  a1 = _mm_loadl_epi64((const __m128i *)(a + 1 * a_stride));
+  a2 = _mm_loadl_epi64((const __m128i *)(a + 2 * a_stride));
+  a3 = _mm_loadl_epi64((const __m128i *)(a + 3 * a_stride));
+
+  b0 = _mm_loadl_epi64((const __m128i *)(b + 0 * b_stride));
+  b1 = _mm_loadl_epi64((const __m128i *)(b + 1 * b_stride));
+  b2 = _mm_loadl_epi64((const __m128i *)(b + 2 * b_stride));
+  b3 = _mm_loadl_epi64((const __m128i *)(b + 3 * b_stride));
+
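+  // Interleave rows 0/1 and 2/3; lane order is irrelevant for sum and SSE.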
+ u0 = _mm_unpacklo_epi16(a0, a1);
+ u1 = _mm_unpacklo_epi16(a2, a3);
+ u2 = _mm_unpacklo_epi16(b0, b1);
+ u3 = _mm_unpacklo_epi16(b2, b3);
+
+ s0 = _mm_sub_epi16(u0, u2);
+ s1 = _mm_sub_epi16(u1, u3);
+
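+  // madd against 1 widens the 16-bit differences and sums adjacent pairs
+  // into 32-bit lanes; the hadd chain reduces them to a single total.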
+ t0 = _mm_madd_epi16(s0, k_one_epi16);
+ t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ y0 = _mm_hadd_epi32(s3, s3);
+
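+  // Same reduction for the squared differences; the 12-bit worst case,
+  // 16 * 4095^2, still fits in a signed 32-bit lane.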
+ t0 = _mm_madd_epi16(s0, s0);
+ t1 = _mm_madd_epi16(s1, s1);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ x0 = _mm_hadd_epi32(s3, s3);
+
+ *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+ *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a,
+ int a_stride,
+ const uint8_t *b,
+ int b_stride,
+ uint32_t *sse) {
+ int64_t sum;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)local_sse;
+
+  return (uint32_t)(*sse - ((sum * sum) >> 4));
+}
+
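+// For 10- and 12-bit input, sse and sum are scaled back to the 8-bit range
+// (by 2 * (bd - 8) and (bd - 8) bits) before the variance is formed, matching
+// the C reference versions.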
+uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a,
+ int a_stride,
+ const uint8_t *b,
+ int b_stride,
+ uint32_t *sse) {
+ int64_t sum;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+ sum = ROUND_POWER_OF_TWO(sum, 2);
+
+  return (uint32_t)(*sse - ((sum * sum) >> 4));
+}
+
+uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a,
+ int a_stride,
+ const uint8_t *b,
+ int b_stride,
+ uint32_t *sse) {
+ int64_t sum;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+ sum = ROUND_POWER_OF_TWO(sum, 4);
+
+  return (uint32_t)(*sse - ((sum * sum) >> 4));
+}
+
+// Sub-pixel
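+// The bilinear first/second filter passes run in C; only the final variance
+// uses the SSE4.1 kernel above (vpx_highbd_*_variance4x4 resolves via RTCD).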
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+ 4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+ 4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+ 4, dst, dst_stride, sse);
+}
+
+// Sub-pixel average
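+// As above, but the filtered block is first averaged with second_pred via
+// vpx_highbd_comp_avg_pred_c.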
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+ 4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+ 4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride,
+ int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride,
+ uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, 4 + 1,
+ 4, bilinear_filters_2t[xoffset]);
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ vpx_highbd_comp_avg_pred_c(temp3, second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+ 4, dst, dst_stride, sse);
+}