SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2),
SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2),
#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8),
SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8),
SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8),
SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8),
SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8),
SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8),
SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10),
SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10),
SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10),
SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10),
SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10),
SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10),
SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12),
SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12),
SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12),
SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12),
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad64x64x4d sse2 neon/;
+ specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad64x32x4d sse2 neon/;
+ specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/;
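# Illustrative use of the x4d interface: a single call compares one source
# block against four candidate references, e.g. (sads being a caller-provided
# output array)
#   uint32_t sads[4];
#   vpx_highbd_sad64x64x4d(src_ptr, src_stride, ref_array, ref_stride, sads);
# For the high-bitdepth variants, src_ptr and ref_array[] are expected to be
# CONVERT_TO_BYTEPTR() aliases of 16-bit sample buffers (the implementations
# below convert them back with CONVERT_TO_SHORTPTR()).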
_mm_storeu_si128((__m128i *)sad_array, sum);
}
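+// Accumulates, for each of the four references, 16-bit per-lane sums of
+// absolute differences against src over `height` rows of a 64-wide
+// high-bitdepth block. The refs[] pointers are advanced past the processed
+// rows; the caller is responsible for advancing src.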
+static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+    // load one row of src; the matching row of each ref[] is loaded below
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ int x;
+
+ for (x = 0; x < 4; ++x) {
+ __m256i r[4];
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));
+
+      // absolute differences between each ref[] row and the src row
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));
+
+ // sum every abs diff
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
+ }
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
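+// Defines vpx_highbd_sad64x{n}x4d_avx2(): rows are processed two at a time,
+// and after each pair the 16-bit partial sums are widened and folded into
+// the 32-bit sums_32 accumulators so they cannot overflow; calc_final_4()
+// then reduces the four 32-bit vectors to the four output SADs.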
+#define HIGHBD_SAD64XNX4D(n) \
+ void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *refs[4]; \
+ __m256i sums_16[4]; \
+ __m256i sums_32[4]; \
+ int i; \
+ \
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
+ sums_32[0] = _mm256_setzero_si256(); \
+ sums_32[1] = _mm256_setzero_si256(); \
+ sums_32[2] = _mm256_setzero_si256(); \
+ sums_32[3] = _mm256_setzero_si256(); \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ sums_16[0] = _mm256_setzero_si256(); \
+ sums_16[1] = _mm256_setzero_si256(); \
+ sums_16[2] = _mm256_setzero_si256(); \
+ sums_16[3] = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \
+ \
+      /* With 12-bit input, each 16-bit lane accumulates up to 4 * 4095    \
+       * per row, so fold sums_16 into the 32-bit sums_32 accumulators     \
+       * every two rows, before the 16-bit sums can overflow. */           \
+ sums_32[0] = _mm256_add_epi32( \
+ sums_32[0], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[0], 1)))); \
+ sums_32[1] = _mm256_add_epi32( \
+ sums_32[1], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[1], 1)))); \
+ sums_32[2] = _mm256_add_epi32( \
+ sums_32[2], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[2], 1)))); \
+ sums_32[3] = _mm256_add_epi32( \
+ sums_32[3], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[3], 1)))); \
+ \
+ src += src_stride << 1; \
+ } \
+ calc_final_4(sums_32, sad_array); \
+ }
+
+// 64x64
+HIGHBD_SAD64XNX4D(64)
+
+// 64x32
+HIGHBD_SAD64XNX4D(32)
+
static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
const uint16_t *src,
int src_stride,