From e253eaa0361fd9cc8e8feca1d344acbf24ea1e21 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Mon, 20 Jul 2015 11:07:22 -0700 Subject: [PATCH] Unify the high bit-depth forward hybrid transforms The SSE2 version high bit-depth forward hybrid transforms are essentially using the C functions via cross referencing to 1-D functions in vp9_dct.c. This commit unifies the two versions and removes the unnecessary dependency. Change-Id: Ib4d0702a138f8daf7d0bd97c141ee7088f293765 --- test/dct16x16_test.cc | 8 --- test/fdct4x4_test.cc | 8 --- vp9/common/vp9_rtcd_defs.pl | 6 +- vp9/encoder/x86/vp9_dct_sse2.c | 102 --------------------------------- 4 files changed, 3 insertions(+), 121 deletions(-) diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index 66ca4bbb9..5a4b1edeb 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -901,14 +901,6 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, Trans16x16HT, ::testing::Values( - make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_10, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht16x16_sse2, &iht16x16_12, 3, VPX_BITS_12), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index 4ee4ad444..08a5ff6a4 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -531,14 +531,6 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, Trans4x4HT, ::testing::Values( - make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_10, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_sse2, &iht4x4_12, 3, VPX_BITS_12), make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 415b2e9de..102f2a09c 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -933,13 +933,13 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # fdct functions add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_highbd_fht4x4 sse2/; + specialize qw/vp9_highbd_fht4x4/; add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_highbd_fht8x8 sse2/; + specialize qw/vp9_highbd_fht8x8/; add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_highbd_fht16x16 sse2/; + specialize qw/vp9_highbd_fht16x16/; add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vp9_highbd_fwht4x4/; diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index cff4fcbdc..9e13a683e 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -2266,108 +2266,6 @@ void vp9_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, store_output(&in1, output); } -#if CONFIG_VP9_HIGHBITDEPTH -/* These SSE2 versions of the FHT functions only actually use SSE2 in the - * DCT_DCT case in all other cases, they revert to C code which is identical - * to that used by the C versions of them. - */ - -void vp9_highbd_fht4x4_sse2(const int16_t *input, tran_low_t *output, - int stride, int tx_type) { - if (tx_type == DCT_DCT) { - vp9_highbd_fdct4x4_sse2(input, output, stride); - } else { - tran_low_t out[4 * 4]; - tran_low_t *outptr = &out[0]; - int i, j; - tran_low_t temp_in[4], temp_out[4]; - const transform_2d ht = FHT_4[tx_type]; - - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = input[j * stride + i] * 16; - if (i == 0 && temp_in[0]) - temp_in[0] += 1; - ht.cols(temp_in, temp_out); - for (j = 0; j < 4; ++j) - outptr[j * 4 + i] = temp_out[j]; - } - - // Rows - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - temp_in[j] = out[j + i * 4]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 4; ++j) - output[j + i * 4] = (temp_out[j] + 1) >> 2; - } - } -} - -void vp9_highbd_fht8x8_sse2(const int16_t *input, tran_low_t *output, - int stride, int tx_type) { - if (tx_type == DCT_DCT) { - vp9_highbd_fdct8x8_sse2(input, output, stride); - } else { - tran_low_t out[64]; - tran_low_t *outptr = &out[0]; - int i, j; - tran_low_t temp_in[8], temp_out[8]; - const transform_2d ht = FHT_8[tx_type]; - - // Columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); - for (j = 0; j < 8; ++j) - outptr[j * 8 + i] = temp_out[j]; - } - - // Rows - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) - temp_in[j] = out[j + i * 8]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 8; ++j) - output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; - } - } -} - -void vp9_highbd_fht16x16_sse2(const int16_t *input, tran_low_t *output, - int stride, int tx_type) { - if (tx_type == DCT_DCT) { - vp9_highbd_fdct16x16_sse2(input, output, stride); - } else { - tran_low_t out[256]; - tran_low_t *outptr = &out[0]; - int i, j; - tran_low_t temp_in[16], temp_out[16]; - const transform_2d ht = FHT_16[tx_type]; - - // Columns - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = input[j * stride + i] * 4; - ht.cols(temp_in, temp_out); - for (j = 0; j < 16; ++j) - outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; - } - - // Rows - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = out[j + i * 16]; - ht.rows(temp_in, temp_out); - for (j = 0; j < 16; ++j) - output[j + i * 16] = temp_out[j]; - } - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - /* * The DCTnxn functions are defined using the macros below. The main code for * them is in separate files (vp9/encoder/x86/vp9_dct_sse2_impl.h & -- 2.40.0