From: Johann Date: Fri, 27 Jan 2017 21:37:36 +0000 (-0800) Subject: hadamard highbd sse2: use tran_low_t for coeff X-Git-Tag: v1.7.0~760^2~2 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2dac808dd17b866eea3a9dc5e49d5bcd2d36e508;p=libvpx hadamard highbd sse2: use tran_low_t for coeff BUG=webm:1365 Change-Id: Ica414007d8412ceebfffa9e58e8416226a3fe934 --- diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index 962cddb51..bc09e952d 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -145,14 +145,14 @@ TEST_P(Hadamard8x8Test, VaryStride) { INSTANTIATE_TEST_CASE_P(C, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_c)); -// TODO(jingning): Remove highbitdepth flag when the SIMD functions are -// in place and turn on the unit test. -#if !CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P(SSE2, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_sse2)); #endif // HAVE_SSE2 +// TODO(jingning): Remove highbitdepth flag when the SIMD functions are +// in place and turn on the unit test. 
+#if !CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSSE3 && ARCH_X86_64 INSTANTIATE_TEST_CASE_P(SSSE3, Hadamard8x8Test, ::testing::Values(&vpx_hadamard_8x8_ssse3)); @@ -212,7 +212,6 @@ TEST_P(Hadamard16x16Test, VaryStride) { } } -#if !CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_CASE_P(C, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_c)); @@ -221,6 +220,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_sse2)); #endif // HAVE_SSE2 +#if !CONFIG_VP9_HIGHBITDEPTH #if HAVE_NEON INSTANTIATE_TEST_CASE_P(NEON, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_neon)); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 3cb2011b8..133f2fbcd 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -888,10 +888,10 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_8x8/; + specialize qw/vpx_hadamard_8x8 sse2/; add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_16x16/; + specialize qw/vpx_hadamard_16x16 sse2/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd/; diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index b0a104bad..955d9ceab 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -11,6 +11,8 @@ #include <emmintrin.h> #include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/fdct.h" #include "vpx_ports/mem.h" void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, @@ -213,7 +215,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) { } void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { + tran_low_t *coeff) { __m128i src[8]; src[0] = _mm_load_si128((const 
__m128i *)src_diff); src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); @@ -227,25 +229,25 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, hadamard_col8_sse2(src, 0); hadamard_col8_sse2(src, 1); - _mm_store_si128((__m128i *)coeff, src[0]); + store_tran_low(src[0], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[1]); + store_tran_low(src[1], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[2]); + store_tran_low(src[2], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[3]); + store_tran_low(src[3], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[4]); + store_tran_low(src[4], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[5]); + store_tran_low(src[5], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[6]); + store_tran_low(src[6], coeff); coeff += 8; - _mm_store_si128((__m128i *)coeff, src[7]); + store_tran_low(src[7], coeff); } void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { + tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { int16_t const *src_ptr = @@ -254,10 +256,10 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, } for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); - __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); - __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); - __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); + __m128i coeff0 = load_tran_low(coeff); + __m128i coeff1 = load_tran_low(coeff + 64); + __m128i coeff2 = load_tran_low(coeff + 128); + __m128i coeff3 = load_tran_low(coeff + 192); __m128i b0 = _mm_add_epi16(coeff0, coeff1); __m128i b1 = _mm_sub_epi16(coeff0, coeff1); @@ -271,13 +273,13 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); - _mm_store_si128((__m128i *)coeff, coeff0); - 
_mm_store_si128((__m128i *)(coeff + 64), coeff1); + store_tran_low(coeff0, coeff); + store_tran_low(coeff1, coeff + 64); coeff2 = _mm_sub_epi16(b0, b2); coeff3 = _mm_sub_epi16(b1, b3); - _mm_store_si128((__m128i *)(coeff + 128), coeff2); - _mm_store_si128((__m128i *)(coeff + 192), coeff3); + store_tran_low(coeff2, coeff + 128); + store_tran_low(coeff3, coeff + 192); coeff += 8; }