Check(expected);
}
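+// Speed check for the SATD implementations; disabled by default. Run with
+// --gtest_also_run_disabled_tests to print the per-blocksize timings.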
+TEST_P(SatdTest, DISABLED_Speed) {
+  const int kCountSpeedTestBlock = 20000;
+  vpx_usec_timer timer;
+  DECLARE_ALIGNED(16, tran_low_t, coeff[1024]);
+  const int blocksize = GET_PARAM(0);
+  // Fill the buffer so the timed calls do not read uninitialized memory.
+  for (int i = 0; i < blocksize; ++i) coeff[i] = static_cast<tran_low_t>(i & 0xff);
+
+  vpx_usec_timer_start(&timer);
+  for (int i = 0; i < kCountSpeedTestBlock; ++i) {
+    GET_PARAM(1)(coeff, blocksize);
+  }
+  vpx_usec_timer_mark(&timer);
+  const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+  printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time);
+}
+
TEST_P(BlockErrorTestFP, MinValue) {
const int64_t kMin = -32640;
const int64_t expected = kMin * kMin * txfm_size_;
#endif // HAVE_SSE2
#if HAVE_AVX2
+INSTANTIATE_TEST_CASE_P(AVX2, SatdTest,
+                        ::testing::Values(make_tuple(16, &vpx_satd_avx2),
+                                          make_tuple(64, &vpx_satd_avx2),
+                                          make_tuple(256, &vpx_satd_avx2),
+                                          make_tuple(1024, &vpx_satd_avx2)));
+
INSTANTIATE_TEST_CASE_P(
    AVX2, BlockErrorTestFP,
    ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2),
                      make_tuple(64, &vp9_block_error_fp_avx2),
                      make_tuple(256, &vp9_block_error_fp_avx2),
                      make_tuple(1024, &vp9_block_error_fp_avx2)));
#endif  // HAVE_AVX2
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon/;
+ specialize qw/vpx_satd avx2 sse2 neon/;
} else {
add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon msa/;
+ specialize qw/vpx_satd avx2 sse2 neon msa/;
}
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
t_coeff += 16;
}
}
+
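+// Sums the absolute values of the |length| coefficients in |coeff|.
+// |length| is at most 1024 (a 32x32 block) for the sizes exercised here, so
+// the 32-bit per-lane accumulation below cannot overflow.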
+int vpx_satd_avx2(const tran_low_t *coeff, int length) {
+  const __m256i one = _mm256_set1_epi16(1);
+  __m256i accum = _mm256_setzero_si256();
+  int i;
+
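+  // Each iteration loads 16 coefficients and takes their absolute values;
+  // madd against a vector of ones then pairwise-adds them into the eight
+  // 32-bit running sums in |accum|.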
+  for (i = 0; i < length; i += 16) {
+    const __m256i src_line = load_tran_low(coeff);
+    const __m256i abs = _mm256_abs_epi16(src_line);
+    const __m256i sum = _mm256_madd_epi16(abs, one);
+    accum = _mm256_add_epi32(accum, sum);
+    coeff += 16;
+  }
+
+  {  // 32 bit horizontal add
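+    // Fold within each 128-bit lane (add the upper 64 bits onto the lower,
+    // then the upper 32 bits), then add the two lane totals and return the
+    // low 32 bits.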
+    const __m256i a = _mm256_srli_si256(accum, 8);
+    const __m256i b = _mm256_add_epi32(accum, a);
+    const __m256i c = _mm256_srli_epi64(b, 32);
+    const __m256i d = _mm256_add_epi32(b, c);
+    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+                                            _mm256_extractf128_si256(d, 1));
+    return _mm_cvtsi128_si32(accum_128);
+  }
+}