// Check basic rate targeting for 1-pass CBR SVC: 2 spatial layers and
// 3 temporal layers, running an HD clip with 4 threads.
-TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4threads) {
+TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) {
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 500;
cfg_.rc_buf_sz = 1000;
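// Note: the rc_buf_* decoder buffer parameters above are expressed in
// milliseconds.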
VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
NEON, Trans32x32Test,
- ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon,
- 0, VPX_BITS_8),
+ ::testing::Values(make_tuple(&vpx_fdct32x32_neon,
+ &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8),
make_tuple(&vpx_fdct32x32_rd_c,
&vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
-#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
// TODO(johannkoenig): high bit depth fdct8x8.
INSTANTIATE_TEST_CASE_P(
SSSE3, TransDCT,
- ::testing::Values(make_tuple(&vpx_fdct32x32_c,
- &vpx_idct32x32_1024_add_ssse3, 32, 0,
- VPX_BITS_8),
+ ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2,
+ 32, 0, VPX_BITS_8),
make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0,
VPX_BITS_8)));
#else
// vpx_fdct8x8_ssse3 is only available in 64-bit builds.
INSTANTIATE_TEST_CASE_P(
SSSE3, TransDCT,
- ::testing::Values(make_tuple(&vpx_fdct32x32_c,
- &vpx_idct32x32_1024_add_ssse3, 32, 0,
- VPX_BITS_8),
+ ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2,
+ 32, 0, VPX_BITS_8),
make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2,
8, 0, VPX_BITS_8)));
#endif // !ARCH_X86_64
#if !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
NEON, TransDCT,
- ::testing::Values(make_tuple(&vpx_fdct16x16_neon,
+ ::testing::Values(make_tuple(&vpx_fdct32x32_neon,
+ &vpx_idct32x32_1024_add_neon, 32, 0,
+ VPX_BITS_8),
+ make_tuple(&vpx_fdct16x16_neon,
&vpx_idct16x16_256_add_neon, 16, 0,
VPX_BITS_8),
make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8,
INSTANTIATE_TEST_CASE_P(C, PartialIDctTest,
::testing::ValuesIn(c_partial_idct_tests));
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_NEON
const PartialInvTxfmParam neon_partial_idct_tests[] = {
#if CONFIG_VP9_HIGHBITDEPTH
make_tuple(&vpx_highbd_fdct32x32_c,
INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest,
::testing::ValuesIn(neon_partial_idct_tests));
-#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_NEON
-#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSE2
// 32x32_135_ is implemented using the 1024 version.
const PartialInvTxfmParam sse2_partial_idct_tests[] = {
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest,
::testing::ValuesIn(sse2_partial_idct_tests));
-#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_SSE2
-#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE
+#if HAVE_SSSE3
const PartialInvTxfmParam ssse3_partial_idct_tests[] = {
- make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
- &wrapper<vpx_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1),
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>,
&wrapper<vpx_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1),
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest,
::testing::ValuesIn(ssse3_partial_idct_tests));
-#endif // HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE
+#endif // HAVE_SSSE3
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const PartialInvTxfmParam sse4_1_partial_idct_tests[] = {
+ make_tuple(
+ &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 12, 2)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, PartialIDctTest,
+ ::testing::ValuesIn(sse4_1_partial_idct_tests));
+#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
const PartialInvTxfmParam dspr2_partial_idct_tests[] = {
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
&wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1),
INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest,
::testing::ValuesIn(dspr2_partial_idct_tests));
-#endif // HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
// 32x32_135_ is implemented using the 1024 version.
const PartialInvTxfmParam msa_partial_idct_tests[] = {
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest,
::testing::ValuesIn(msa_partial_idct_tests));
-#endif // HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
+
+#endif // !CONFIG_EMULATE_HARDWARE
} // namespace
////////////////////////////////////////////////////////////////////////////////
-using ::std::tr1::get;
-using ::std::tr1::make_tuple;
-using ::std::tr1::tuple;
-
-template <typename SubpelVarianceFunctionType>
+template <typename FunctionType>
class SubpelVarianceTest
- : public ::testing::TestWithParam<
- tuple<int, int, SubpelVarianceFunctionType, int> > {
+ : public ::testing::TestWithParam<TestParams<FunctionType> > {
public:
virtual void SetUp() {
- const tuple<int, int, SubpelVarianceFunctionType, int> ¶ms =
- this->GetParam();
- log2width_ = get<0>(params);
- width_ = 1 << log2width_;
- log2height_ = get<1>(params);
- height_ = 1 << log2height_;
- subpel_variance_ = get<2>(params);
- if (get<3>(params)) {
- bit_depth_ = (vpx_bit_depth_t)get<3>(params);
- use_high_bit_depth_ = true;
- } else {
- bit_depth_ = VPX_BITS_8;
- use_high_bit_depth_ = false;
- }
- mask_ = (1 << bit_depth_) - 1;
+ params_ = this->GetParam();
rnd_.Reset(ACMRandom::DeterministicSeed());
- block_size_ = width_ * height_;
- if (!use_high_bit_depth_) {
- src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
- sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
- ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+ if (!use_high_bit_depth()) {
+ src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
+ sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size()));
+ ref_ = new uint8_t[block_size() + width() + height() + 1];
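+    // Note that block_size() + width() + height() + 1 equals
+    // (width() + 1) * (height() + 1), the area the sub-pel filters read
+    // from ref_ when given a stride of width() + 1.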
#if CONFIG_VP9_HIGHBITDEPTH
} else {
src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- vpx_memalign(16, block_size_ * sizeof(uint16_t))));
+ vpx_memalign(16, block_size() * sizeof(uint16_t))));
sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
- vpx_memalign(16, block_size_ * sizeof(uint16_t))));
- ref_ =
- CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]);
+ vpx_memalign(16, block_size() * sizeof(uint16_t))));
+ ref_ = CONVERT_TO_BYTEPTR(
+ new uint16_t[block_size() + width() + height() + 1]);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
ASSERT_TRUE(src_ != NULL);
}
virtual void TearDown() {
- if (!use_high_bit_depth_) {
+ if (!use_high_bit_depth()) {
vpx_free(src_);
delete[] ref_;
vpx_free(sec_);
uint8_t *src_;
uint8_t *ref_;
uint8_t *sec_;
- bool use_high_bit_depth_;
- vpx_bit_depth_t bit_depth_;
- int width_, log2width_;
- int height_, log2height_;
- int block_size_, mask_;
- SubpelVarianceFunctionType subpel_variance_;
+ TestParams<FunctionType> params_;
+
+  // Helper accessors that forward to params_.
+ bool use_high_bit_depth() const { return params_.use_high_bit_depth; }
+ int byte_shift() const { return params_.bit_depth - 8; }
+ int block_size() const { return params_.block_size; }
+ int width() const { return params_.width; }
+ int height() const { return params_.height; }
+ uint32_t mask() const { return params_.mask; }
};
template <typename SubpelVarianceFunctionType>
void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
- if (!use_high_bit_depth_) {
- for (int j = 0; j < block_size_; j++) {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
src_[j] = rnd_.Rand8();
}
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
ref_[j] = rnd_.Rand8();
}
#if CONFIG_VP9_HIGHBITDEPTH
} else {
- for (int j = 0; j < block_size_; j++) {
- CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
}
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(
- var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
- const unsigned int var2 =
- subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
- use_high_bit_depth_, bit_depth_);
+ var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+ const unsigned int var2 = subpel_variance_ref(
+ ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
// Src: Set the first half of values to 0, the second half to the maximum.
// Ref: Set the first half of values to the maximum, the second half to 0.
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
- const int half = block_size_ / 2;
- if (!use_high_bit_depth_) {
+ const int half = block_size() / 2;
+ if (!use_high_bit_depth()) {
memset(src_, 0, half);
memset(src_ + half, 255, half);
memset(ref_, 255, half);
- memset(ref_ + half, 0, half + width_ + height_ + 1);
+ memset(ref_ + half, 0, half + width() + height() + 1);
#if CONFIG_VP9_HIGHBITDEPTH
} else {
- vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half);
vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
- vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
- half + width_ + height_ + 1);
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(),
+ half + width() + height() + 1);
#endif // CONFIG_VP9_HIGHBITDEPTH
}
unsigned int sse1, sse2;
unsigned int var1;
ASM_REGISTER_STATE_CHECK(
- var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1));
- const unsigned int var2 =
- subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2,
- use_high_bit_depth_, bit_depth_);
+ var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1));
+ const unsigned int var2 = subpel_variance_ref(
+ ref_, src_, params_.log2width, params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y;
EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y;
}
void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
- if (!use_high_bit_depth_) {
- for (int j = 0; j < block_size_; j++) {
+ if (!use_high_bit_depth()) {
+ for (int j = 0; j < block_size(); j++) {
src_[j] = rnd_.Rand8();
sec_[j] = rnd_.Rand8();
}
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
ref_[j] = rnd_.Rand8();
}
#if CONFIG_VP9_HIGHBITDEPTH
} else {
- for (int j = 0; j < block_size_; j++) {
- CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
- CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_;
+ for (int j = 0; j < block_size(); j++) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask();
}
- for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
- CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+ for (int j = 0; j < block_size() + width() + height() + 1; j++) {
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
}
#endif // CONFIG_VP9_HIGHBITDEPTH
}
uint32_t sse1, sse2;
uint32_t var1, var2;
- ASM_REGISTER_STATE_CHECK(var1 =
- subpel_variance_(ref_, width_ + 1, x, y,
- src_, width_, &sse1, sec_));
- var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_,
- x, y, &sse2, use_high_bit_depth_,
- static_cast<vpx_bit_depth_t>(bit_depth_));
+ ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y,
+ src_, width(), &sse1, sec_));
+ var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width,
+ params_.log2height, x, y, &sse2,
+ use_high_bit_depth(), params_.bit_depth);
EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y;
EXPECT_EQ(var1, var2) << "at position " << x << ", " << y;
}
VarianceParams(2, 3, &vpx_variance4x8_c),
VarianceParams(2, 2, &vpx_variance4x4_c)));
+typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
INSTANTIATE_TEST_CASE_P(
C, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
-
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
+
+typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
INSTANTIATE_TEST_CASE_P(
C, VpxSubpelAvgVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
#if CONFIG_VP9_HIGHBITDEPTH
typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest;
/* TODO(debargha): This test does not support the highbd version
INSTANTIATE_TEST_CASE_P(
C, VpxHBDMseTest,
- ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c),
- make_tuple(4, 4, &vpx_highbd_12_mse16x8_c),
- make_tuple(4, 4, &vpx_highbd_12_mse8x16_c),
- make_tuple(4, 4, &vpx_highbd_12_mse8x8_c),
- make_tuple(4, 4, &vpx_highbd_10_mse16x16_c),
- make_tuple(4, 4, &vpx_highbd_10_mse16x8_c),
- make_tuple(4, 4, &vpx_highbd_10_mse8x16_c),
- make_tuple(4, 4, &vpx_highbd_10_mse8x8_c),
- make_tuple(4, 4, &vpx_highbd_8_mse16x16_c),
- make_tuple(4, 4, &vpx_highbd_8_mse16x8_c),
- make_tuple(4, 4, &vpx_highbd_8_mse8x16_c),
- make_tuple(4, 4, &vpx_highbd_8_mse8x8_c)));
+ ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c),
+ MseParams(4, 4, &vpx_highbd_12_mse16x8_c),
+ MseParams(4, 4, &vpx_highbd_12_mse8x16_c),
+ MseParams(4, 4, &vpx_highbd_12_mse8x8_c),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_c),
+ MseParams(4, 4, &vpx_highbd_10_mse16x8_c),
+ MseParams(4, 4, &vpx_highbd_10_mse8x16_c),
+ MseParams(4, 4, &vpx_highbd_10_mse8x8_c),
+ MseParams(4, 4, &vpx_highbd_8_mse16x16_c),
+ MseParams(4, 4, &vpx_highbd_8_mse16x8_c),
+ MseParams(4, 4, &vpx_highbd_8_mse8x16_c),
+ MseParams(4, 4, &vpx_highbd_8_mse8x8_c)));
*/
INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
C, VpxHBDSubpelVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
- make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
- make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
- make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
- make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
- make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
- make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
- make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
- make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
- make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
- make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
- make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
- make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
- make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
- make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
- make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10),
- make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10),
- make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10),
- make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10),
- make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
- make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
- make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
- make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
- make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
- make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
- make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
- make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
- make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
- make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12),
- make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12),
- make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12),
- make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12),
- make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
- make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
- make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
- make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
- make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
- make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)));
+ SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
+ SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
+ SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
+ SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
+ SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
+ SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
+ SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
+ SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
+ SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
+ SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
+ SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
+ SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
+ SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+ SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c,
+ 10),
+ SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c,
+ 10),
+ SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c,
+ 10),
+ SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c,
+ 10),
+ SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c,
+ 10),
+ SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c,
+ 10),
+ SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c,
+ 10),
+ SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
+ SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
+ SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
+ SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
+ SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
+ SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+ SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c,
+ 12),
+ SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c,
+ 12),
+ SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c,
+ 12),
+ SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c,
+ 12),
+ SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c,
+ 12),
+ SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c,
+ 12),
+ SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c,
+ 12),
+ SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
+ SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
+ SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
+ SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
+ SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
+ SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c,
+ 12)));
INSTANTIATE_TEST_CASE_P(
C, VpxHBDSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
- make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
- make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
- make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
- make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
- make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
- make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
- make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
- make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8),
- make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
- make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
- make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
- make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
- make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
- make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
- make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10),
- make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10),
- make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10),
- make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10),
- make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10),
- make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10),
- make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
- make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
- make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
- make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
- make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
- make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
- make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
- make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12),
- make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12),
- make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12),
- make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12),
- make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12),
- make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12),
- make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
- make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
- make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
- make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+ SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c,
+ 8),
+ SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c,
+ 8),
+ SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c,
+ 8),
+ SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c,
+ 8),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance64x64_c,
+ 10),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance64x32_c,
+ 10),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance32x64_c,
+ 10),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance32x32_c,
+ 10),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance32x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance16x32_c,
+ 10),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance16x16_c,
+ 10),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance16x8_c,
+ 10),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance8x16_c,
+ 10),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+ SubpelAvgVarianceParams(2, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+ SubpelAvgVarianceParams(2, 2,
+ &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance64x64_c,
+ 12),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance64x32_c,
+ 12),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance32x64_c,
+ 12),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance32x32_c,
+ 12),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance32x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance16x32_c,
+ 12),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance16x16_c,
+ 12),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance16x8_c,
+ 12),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance8x16_c,
+ 12),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+ SubpelAvgVarianceParams(2, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+ SubpelAvgVarianceParams(2, 2,
+ &vpx_highbd_12_sub_pixel_avg_variance4x4_c,
+ 12)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0)));
INSTANTIATE_TEST_CASE_P(
SSE2, VpxSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
#if CONFIG_VP9_HIGHBITDEPTH
/* TODO(debargha): This test does not support the highbd version
INSTANTIATE_TEST_CASE_P(
SSE2, VpxHBDSubpelVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12),
- make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12),
- make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12),
- make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12),
- make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12),
- make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12),
- make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12),
- make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12),
- make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12),
- make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12),
- make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12),
- make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10),
- make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10),
- make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10),
- make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10),
- make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10),
- make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10),
- make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10),
- make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10),
- make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10),
- make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10),
- make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10),
- make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8),
- make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8),
- make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8),
- make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8),
- make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8),
- make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8),
- make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8),
- make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8),
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8),
- make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
- make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8)));
+ SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2,
+ 12),
+ SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2,
+ 12),
+ SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2,
+ 12),
+ SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2,
+ 12),
+ SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2,
+ 12),
+ SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2,
+ 12),
+ SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2,
+ 12),
+ SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2,
+ 12),
+ SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2,
+ 12),
+ SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2,
+ 12),
+ SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2,
+ 12),
+ SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2,
+ 10),
+ SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2,
+ 10),
+ SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2,
+ 10),
+ SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2,
+ 10),
+ SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2,
+ 10),
+ SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2,
+ 10),
+ SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2,
+ 10),
+ SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2,
+ 10),
+ SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2,
+ 10),
+ SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2,
+ 10),
+ SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2,
+ 10),
+ SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2,
+ 8),
+ SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2,
+ 8),
+ SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2,
+ 8),
+ SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2,
+ 8),
+ SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2,
+ 8),
+ SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2,
+ 8),
+ SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2,
+ 8),
+ SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2,
+ 8),
+ SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2,
+ 8),
+ SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8),
+ SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2,
+ 8)));
INSTANTIATE_TEST_CASE_P(
SSE2, VpxHBDSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12),
- make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12),
- make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12),
- make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12),
- make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12),
- make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12),
- make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12),
- make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12),
- make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12),
- make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12),
- make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12),
- make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10),
- make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10),
- make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10),
- make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10),
- make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10),
- make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10),
- make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10),
- make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10),
- make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10),
- make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10),
- make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10),
- make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8),
- make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8),
- make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8),
- make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8),
- make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8),
- make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8),
- make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8),
- make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8),
- make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8),
- make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8),
- make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8)));
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2,
+ 12),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2,
+ 12),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2,
+ 10),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2,
+ 10),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2,
+ 8),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2,
+ 8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_SSE2
#if HAVE_SSSE3
INSTANTIATE_TEST_CASE_P(
SSSE3, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0)));
INSTANTIATE_TEST_CASE_P(
SSSE3, VpxSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0)));
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3,
+ 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3,
+ 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3,
+ 0)));
#endif // HAVE_SSSE3
#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(
AVX2, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0)));
INSTANTIATE_TEST_CASE_P(
AVX2, VpxSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0)));
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2,
+ 0)));
#endif // HAVE_AVX2
#if HAVE_NEON
INSTANTIATE_TEST_CASE_P(
NEON, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_neon, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_neon, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_neon, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_neon, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_neon, 0),
- make_tuple(2, 2, &vpx_sub_pixel_variance4x4_neon, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0),
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0)));
INSTANTIATE_TEST_CASE_P(
NEON, VpxSubpelAvgVarianceTest,
::testing::Values(
- make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
#endif // HAVE_NEON
#if HAVE_MSA
INSTANTIATE_TEST_CASE_P(
MSA, VpxSubpelVarianceTest,
- ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
- make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
- make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
- make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
- make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
- make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
- make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
- make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
- make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
- make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
- make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
- make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
+ ::testing::Values(
+ SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0),
+ SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0),
+ SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0),
+ SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0),
+ SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0),
+ SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0),
+ SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0),
+ SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0),
+ SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0),
+ SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0),
+ SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0),
+ SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0),
+ SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0)));
INSTANTIATE_TEST_CASE_P(
MSA, VpxSubpelAvgVarianceTest,
- ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
- make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
- make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
- make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
- make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
- make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
- make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
- make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
- make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
- make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
- make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
- make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
- make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0),
+ SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0),
+ SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0),
+ SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0),
+ SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0),
+ SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0),
+ SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0),
+ SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0),
+ SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0),
+ SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0),
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0),
+ SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0),
+ SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0)));
#endif // HAVE_MSA
#if HAVE_VSX
URL: https://chromium.googlesource.com/webm/libwebm
-Version: 9732ae991efb71aced4267d4794918279e362d99
+Version: a97c484bfd6b5de4b1b61efe33089b55d810b412
License: BSD
License File: LICENSE.txt
int chroma_subsampling;
};
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr;
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic pop
+#endif
bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc,
PrimaryChromaticityPtr* muxer_pc);
#include "mkvmuxer/mkvwriter.h"
#include "mkvparser/mkvparser.h"
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
namespace mkvmuxer {
const float PrimaryChromaticity::kChromaticityMin = 0.0f;
output_cues_(true),
accurate_cluster_duration_(false),
fixed_size_cluster_timecode_(false),
- estimate_file_duration_(true),
+ estimate_file_duration_(false),
payload_pos_(0),
size_position_(0),
doc_type_version_(kDefaultDocTypeVersion),
track->set_width(width);
track->set_height(height);
- tracks_.AddTrack(track, number);
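+  // AddTrack fails on an invalid or duplicate track number (or on allocation
+  // failure); free the track so it is not leaked.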
+ if (!tracks_.AddTrack(track, number)) {
+ delete track;
+ return 0;
+ }
has_video_ = true;
return track->number();
cue->set_block_number(cluster->blocks_added());
cue->set_cluster_pos(cluster->position_for_cues());
cue->set_track(track);
- if (!cues_.AddCue(cue))
+ if (!cues_.AddCue(cue)) {
+ delete cue;
return false;
+ }
new_cuepoint_ = false;
return true;
track->set_sample_rate(sample_rate);
track->set_channels(channels);
- tracks_.AddTrack(track, number);
+ if (!tracks_.AddTrack(track, number)) {
+ delete track;
+ return 0;
+ }
return track->number();
}
if (frame->discard_padding() != 0)
doc_type_version_ = 4;
+ if (cluster_list_size_ > 0) {
+ const uint64_t timecode_scale = segment_info_.timecode_scale();
+ const uint64_t frame_timecode = frame->timestamp() / timecode_scale;
+
+ const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1];
+ const uint64_t last_cluster_timecode = last_cluster->timecode();
+
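+    // Matroska stores block timecodes as signed 16-bit offsets from the
+    // cluster timecode, so an offset above kMaxBlockTimecode cannot be
+    // represented and a new cluster must be started.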
+ const uint64_t rel_timecode = frame_timecode - last_cluster_timecode;
+ if (rel_timecode > kMaxBlockTimecode) {
+ force_new_cluster_ = true;
+ }
+ }
+
// If the segment has a video track, hold onto audio frames to make sure the
// audio that is associated with the start time of a video key-frame is
// muxed into the same cluster.
if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) &&
!force_new_cluster_) {
Frame* const new_frame = new (std::nothrow) Frame();
- if (!new_frame || !new_frame->CopyFrom(*frame))
+ if (!new_frame || !new_frame->CopyFrom(*frame)) {
+ delete new_frame;
return false;
- if (!QueueFrame(new_frame))
+ }
+ if (!QueueFrame(new_frame)) {
+ delete new_frame;
return false;
+ }
track_frames_written_[frame->track_number() - 1]++;
return true;
}
if (!frame->CanBeSimpleBlock() && !frame->is_key() &&
!frame->reference_block_timestamp_set()) {
Frame* const new_frame = new (std::nothrow) Frame();
- if (!new_frame->CopyFrom(*frame))
+ if (!new_frame || !new_frame->CopyFrom(*frame)) {
+ delete new_frame;
return false;
+ }
new_frame->set_reference_block_timestamp(
last_track_timestamp_[frame->track_number() - 1]);
frame = new_frame;
ebml_size += strlen(value);
// Size of Datasize
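+  // EBML data sizes are variable-length coded, so the size field itself
+  // grows with the payload: short strings fit in one size byte, longer
+  // ones need more. A fixed one-byte count undercounts for long values.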
- ebml_size++;
+ ebml_size += GetCodedUIntSize(strlen(value));
return ebml_size;
}
#include "mkvmuxer/mkvwriter.h"
+#include <sys/types.h>
+
#ifdef _MSC_VER
#include <share.h> // for _SH_DENYWR
#endif
#include "common/webmids.h"
+// disable deprecation warnings for auto_ptr
+#if defined(__GNUC__) && __GNUC__ >= 5
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
+#endif
+
namespace mkvparser {
const float MasteringMetadata::kValueNotPresent = FLT_MAX;
const long long Colour::kValueNotPresent = LLONG_MAX;
if (pos != stop)
return E_FILE_FORMAT_INVALID;
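+  // entry_count may legitimately be zero; allocate only when there are
+  // entries so that m_entries (and m_void_elements below) stay NULL
+  // otherwise.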
- m_entries = new (std::nothrow) Entry[entry_count];
+ if (entry_count > 0) {
+ m_entries = new (std::nothrow) Entry[entry_count];
- if (m_entries == NULL)
- return -1;
+ if (m_entries == NULL)
+ return -1;
+ }
- m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+ if (void_element_count > 0) {
+ m_void_elements = new (std::nothrow) VoidElement[void_element_count];
- if (m_void_elements == NULL)
- return -1;
+ if (m_void_elements == NULL)
+ return -1;
+ }
// now parse the entries and void elements
if (status < 0) // error
return status;
- if (id == libwebm::kMkvSeek) {
+ if (id == libwebm::kMkvSeek && entry_count > 0) {
if (ParseEntry(pReader, pos, size, pEntry)) {
Entry& e = *pEntry++;
e.element_start = idpos;
e.element_size = (pos + size) - idpos;
}
- } else if (id == libwebm::kMkvVoid) {
+ } else if (id == libwebm::kMkvVoid && void_element_count > 0) {
VoidElement& e = *pVoidElement++;
e.element_start = idpos;
}
const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const {
- assert(pTrack);
+ if (pTrack == NULL) {
+ return NULL;
+ }
const long long n = pTrack->GetNumber();
}
const double rollover_check = m_duration * m_timecodeScale;
- if (rollover_check > LLONG_MAX)
+ if (rollover_check > static_cast<double>(LLONG_MAX))
return E_FILE_FORMAT_INVALID;
if (pos != stop)
if (!reader)
return false;
- std::auto_ptr<PrimaryChromaticity> chromaticity_ptr;
-
- if (!*chromaticity) {
- chromaticity_ptr.reset(new PrimaryChromaticity());
- } else {
- chromaticity_ptr.reset(*chromaticity);
- }
+ if (!*chromaticity)
+ *chromaticity = new PrimaryChromaticity();
- if (!chromaticity_ptr.get())
+ if (!*chromaticity)
return false;
- float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y;
+ PrimaryChromaticity* pc = *chromaticity;
+ float* value = is_x ? &pc->x : &pc->y;
double parser_value = 0;
- const long long value_parse_status =
+ const long long parse_status =
UnserializeFloat(reader, read_pos, value_size, parser_value);
- *value = static_cast<float>(parser_value);
-
- if (value_parse_status < 0 || *value < 0.0 || *value > 1.0)
+ // Valid range is [0, 1]. Make sure the double is representable as a float
+ // before casting.
+ if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 ||
+ (parser_value > 0.0 && parser_value < FLT_MIN))
return false;
- *chromaticity = chromaticity_ptr.release();
+ *value = static_cast<float>(parser_value);
+
return true;
}
double value = 0;
const long long value_parse_status =
UnserializeFloat(reader, read_pos, child_size, value);
- if (value_parse_status < 0) {
+ // Make sure value is representable as a float before casting.
+ if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX ||
+ (value > 0.0 && value < FLT_MIN)) {
return false;
}
pf = m_frames;
while (pf != pf_end) {
Frame& f = *pf++;
- assert((pos + f.len) <= stop);
if ((pos + f.len) > stop)
return E_FILE_FORMAT_INVALID;
// be found in the AUTHORS file in the root of the source tree.
#include "mkvparser/mkvreader.h"
+#include <sys/types.h>
+
#include <cassert>
namespace mkvparser {
#include "vpx/vpx_integer.h"
#include "./y4minput.h"
-void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
- uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
- uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+static void ssim_parms_8x8(unsigned char *s, int sp, unsigned char *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
int i, j;
for (i = 0; i < 8; i++, s += sp, r += rp) {
for (j = 0; j < 8; j++) {
static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
- vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
- &sum_sxr);
+ ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
}
// We are using an 8x8 moving window with the starting location of each 8x8
// window on the 4x4 pixel grid. Such an arrangement allows the windows to
// overlap block boundaries to penalize blocking artifacts.
-double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
- int stride_img2, int width, int height) {
+static double ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+ int stride_img2, int width, int height) {
int i, j;
int samples = 0;
double ssim_total = 0;
}
#define MAX_PSNR 100
-double vp9_mse2psnr(double samples, double peak, double mse) {
+static double mse2psnr(double samples, double peak, double mse) {
double psnr;
if (mse > 0.0)
} input_file_t;
// Open a file and determine if it's y4m or raw. If y4m, get the header.
-int open_input_file(const char *file_name, input_file_t *input, int w, int h) {
+static int open_input_file(const char *file_name, input_file_t *input, int w,
+ int h) {
char y4m_buf[4];
size_t r1;
input->type = RAW_YUV;
return 0;
}
-void close_input_file(input_file_t *in) {
+static void close_input_file(input_file_t *in) {
if (in->file) fclose(in->file);
if (in->type == Y4M) {
vpx_img_free(&in->img);
}
}
-size_t read_input_file(input_file_t *in, unsigned char **y, unsigned char **u,
- unsigned char **v) {
+static size_t read_input_file(input_file_t *in, unsigned char **y,
+ unsigned char **u, unsigned char **v) {
size_t r1 = 0;
switch (in->type) {
case Y4M:
break;
}
#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
- ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \
+ ssim = ssim2(buf0, buf1, w, w, w, h); \
psnr = calc_plane_error(buf0, w, buf1, w, w, h);
if (n_frames == allocated_frames) {
ssimuavg += ssimu[i];
ssimvavg += ssimv[i];
- frame_psnr = vp9_mse2psnr(w * h * 6 / 4, 255.0,
- (double)psnry[i] + psnru[i] + psnrv[i]);
- frame_psnry = vp9_mse2psnr(w * h * 4 / 4, 255.0, (double)psnry[i]);
- frame_psnru = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnru[i]);
- frame_psnrv = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnrv[i]);
+ frame_psnr =
+ mse2psnr(w * h * 6 / 4, 255.0, (double)psnry[i] + psnru[i] + psnrv[i]);
+ frame_psnry = mse2psnr(w * h * 4 / 4, 255.0, (double)psnry[i]);
+ frame_psnru = mse2psnr(w * h * 1 / 4, 255.0, (double)psnru[i]);
+ frame_psnrv = mse2psnr(w * h * 1 / 4, 255.0, (double)psnrv[i]);
psnravg += frame_psnr;
psnryavg += frame_psnry;
puts("");
psnrglb = psnryglb + psnruglb + psnrvglb;
- psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb);
- psnryglb = vp9_mse2psnr((double)n_frames * w * h * 4 / 4, 255.0, psnryglb);
- psnruglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnruglb);
- psnrvglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnrvglb);
+ psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb);
+ psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, 255.0, psnryglb);
+ psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnruglb);
+ psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnrvglb);
printf("GlbPSNR: %lf\n", psnrglb);
printf("GlbPSNR-Y: %lf\n", psnryglb);
u = vp8_signed_char_clamp(ps1 + filter_value);
*op1 = u ^ 0x80;
}
-void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh, int count) {
+
+static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
} while (++i < count * 8);
}
-void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh, int count) {
+static void loop_filter_vertical_edge_c(unsigned char *s, int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
int hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
*op2 = s ^ 0x80;
}
-void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh,
- int count) {
+static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
} while (++i < count * 8);
}
-void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p,
- const unsigned char *blimit,
- const unsigned char *limit,
- const unsigned char *thresh, int count) {
+static void mbloop_filter_vertical_edge_c(unsigned char *s, int p,
+ const unsigned char *blimit,
+ const unsigned char *limit,
+ const unsigned char *thresh,
+ int count) {
signed char hev = 0; /* high edge variance */
signed char mask = 0;
int i = 0;
void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi) {
- vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
+ mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
if (u_ptr) {
- vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
}
if (v_ptr) {
- vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
}
}
void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi) {
- vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 2);
+ mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 2);
if (u_ptr) {
- vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
}
if (v_ptr) {
- vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
- lfi->hev_thr, 1);
+ mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim,
+ lfi->hev_thr, 1);
}
}
void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi) {
- vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim,
- lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim,
- lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim,
- lfi->lim, lfi->hev_thr, 2);
+ loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
+ loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
+ loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 2);
if (u_ptr) {
- vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
+ loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 1);
}
if (v_ptr) {
- vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride,
- lfi->blim, lfi->lim, lfi->hev_thr, 1);
+ loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim,
+ lfi->lim, lfi->hev_thr, 1);
}
}
void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr,
unsigned char *v_ptr, int y_stride, int uv_stride,
loop_filter_info *lfi) {
- vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 2);
+ loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+ loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
+ loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 2);
if (u_ptr) {
- vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 1);
+ loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 1);
}
if (v_ptr) {
- vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
- lfi->hev_thr, 1);
+ loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim,
+ lfi->hev_thr, 1);
}
}
#include "vpx/vp8.h"
struct VP8D_COMP;
+struct VP8Common;
typedef struct {
int Width;
int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd,
int64_t *time_stamp, int64_t *time_end_stamp,
vp8_ppflags_t *flags);
+int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame);
vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp,
enum vpx_ref_frame_type ref_frame_flag,
VP8_COMMON *const cm = &cpi->common;
uint8_t *y;
const uint8_t *src_y = cpi->Source->y_buffer;
- const uint8_t *src_u = cpi->Source->u_buffer;
- const uint8_t *src_v = cpi->Source->v_buffer;
const int src_ystride = cpi->Source->y_stride;
- const int src_uvstride = cpi->Source->uv_stride;
+ int offset = 0;
YV12_BUFFER_CONFIG skinmap;
memset(&skinmap, 0, sizeof(skinmap));
y = skinmap.y_buffer;
// Loop through blocks and set skin map based on center pixel of block.
// Set y to white for skin block, otherwise set to source with gray scale.
- // Ignore rightmost/bottom boundary blocks.
for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) {
num_bl = 0;
for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) {
- int is_skin = 0;
- int consec_zeromv = 0;
- const int bl_index = mb_row * cm->mb_cols + mb_col;
- const int bl_index1 = bl_index + 1;
- const int bl_index2 = bl_index + cm->mb_cols;
- const int bl_index3 = bl_index2 + 1;
- consec_zeromv = VPXMIN(cpi->consec_zero_last[bl_index],
- VPXMIN(cpi->consec_zero_last[bl_index1],
- VPXMIN(cpi->consec_zero_last[bl_index2],
- cpi->consec_zero_last[bl_index3])));
- is_skin =
- vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
- SKIN_8X8, consec_zeromv, 0);
+ const int is_skin = cpi->skin_map[offset++];
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j++) {
- if (is_skin)
- y[i * src_ystride + j] = 255;
- else
- y[i * src_ystride + j] = src_y[i * src_ystride + j];
+ y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j];
}
}
num_bl++;
y += 16;
src_y += 16;
- src_u += 8;
- src_v += 8;
}
y += (src_ystride << 4) - (num_bl << 4);
src_y += (src_ystride << 4) - (num_bl << 4);
- src_u += (src_uvstride << 3) - (num_bl << 3);
- src_v += (src_uvstride << 3) - (num_bl << 3);
}
vpx_write_yuv_frame(yuv_skinmap_file, &skinmap);
vpx_free_frame_buffer(&skinmap);
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "decodemv.h"
#include "treereader.h"
#include "vp8/common/entropymv.h"
#include "vp8/common/entropymode.h"
#endif
extern void vp8_init_loop_filter(VP8_COMMON *cm);
-extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
static int get_free_fb(VP8_COMMON *cm);
static void ref_cnt_fb(int *buf, int *idx, int new_idx);
void *decrypt_state;
} VP8D_COMP;
+void vp8cx_init_de_quantizer(VP8D_COMP *pbi);
+void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
int vp8_decode_frame(VP8D_COMP *cpi);
int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf);
#include "vp8/common/loopfilter.h"
#include "vp8/common/extend.h"
#include "vpx_ports/vpx_timer.h"
+#include "decoderthreading.h"
#include "detokenize.h"
#include "vp8/common/reconintra4x4.h"
#include "vp8/common/reconinter.h"
memset((p), 0, (n) * sizeof(*(p))); \
} while (0)
-void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
-
static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd,
MB_ROW_DEC *mbrd, int count) {
VP8_COMMON *const pc = &pbi->common;
extern "C" {
#endif
+#include "vp8/encoder/treewriter.h"
+#include "vp8/encoder/tokenize.h"
+
void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount);
+void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi);
+void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra,
+ int prob_last, int prob_garf);
+int vp8_estimate_entropy_savings(struct VP8_COMP *cpi);
+void vp8_update_coef_probs(struct VP8_COMP *cpi);
#ifdef __cplusplus
} // extern "C"
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
+#include "bitstream.h"
#include "encodemb.h"
#include "encodemv.h"
+#if CONFIG_MULTITHREAD
+#include "ethreading.h"
+#endif
#include "vp8/common/common.h"
#include "onyx_int.h"
#include "vp8/common/extend.h"
#include "encodeframe.h"
extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
-extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra,
- int prob_last, int prob_garf);
-extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi);
-extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
-extern void vp8_auto_select_speed(VP8_COMP *cpi);
-extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x,
- MB_ROW_COMP *mbr_ei, int count);
static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
#ifdef MODE_STATS
#ifndef VP8_ENCODER_ENCODEFRAME_H_
#define VP8_ENCODER_ENCODEFRAME_H_
+#include "vp8/encoder/tokenize.h"
+
#ifdef __cplusplus
extern "C" {
#endif
-extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x);
-extern void vp8_build_block_offsets(MACROBLOCK *x);
+struct VP8_COMP;
+struct macroblock;
+
+void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x);
+
+void vp8_build_block_offsets(struct macroblock *x);
-extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+void vp8_setup_block_ptrs(struct macroblock *x);
-extern void vp8_encode_frame(VP8_COMP *cpi);
+void vp8_encode_frame(struct VP8_COMP *cpi);
-extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
- TOKENEXTRA **t, int recon_yoffset,
- int recon_uvoffset, int mb_row,
- int mb_col);
+int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x,
+ TOKENEXTRA **t, int recon_yoffset,
+ int recon_uvoffset, int mb_row, int mb_col);
-extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
- TOKENEXTRA **t);
+int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x,
+ TOKENEXTRA **t);
#ifdef __cplusplus
} // extern "C"
#endif
#include "vp8/common/extend.h"
#include "bitstream.h"
#include "encodeframe.h"
+#include "ethreading.h"
#if CONFIG_MULTITHREAD
--- /dev/null
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_ETHREADING_H_
+#define VP8_ENCODER_ETHREADING_H_
+
+#include "vp8/encoder/onyx_int.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+struct macroblock;
+
+void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x,
+ MB_ROW_COMP *mbr_ei, int count);
+int vp8cx_create_encoder_threads(struct VP8_COMP *cpi);
+void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VP8_ENCODER_ETHREADING_H_
#include "./vpx_scale_rtcd.h"
#include "./vpx_dsp_rtcd.h"
#include "./vp8_rtcd.h"
+#include "bitstream.h"
#include "vp8/common/onyxc_int.h"
#include "vp8/common/blockd.h"
#include "onyx_int.h"
#include "mr_dissim.h"
#endif
#include "encodeframe.h"
+#if CONFIG_MULTITHREAD
+#include "ethreading.h"
+#endif
+#include "picklpf.h"
+#if !CONFIG_REALTIME_ONLY
+#include "temporal_filter.h"
+#endif
#include <assert.h>
#include <math.h>
#if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
extern int vp8_update_coef_context(VP8_COMP *cpi);
-extern void vp8_update_coef_probs(VP8_COMP *cpi);
#endif
-extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
-extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
-extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
-
extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source,
YV12_BUFFER_CONFIG *post, int filt_lvl,
int low_var_thresh, int flag);
extern void print_parms(VP8_CONFIG *ocf, char *filenam);
extern unsigned int vp8_get_processor_freq();
extern void print_tree_update_probs();
-extern int vp8cx_create_encoder_threads(VP8_COMP *cpi);
-extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
-
-int vp8_estimate_entropy_savings(VP8_COMP *cpi);
int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest);
-extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance);
-
static void set_default_lf_deltas(VP8_COMP *cpi);
extern const int vp8_gf_interval_table[101];
set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA);
}
+static void compute_skin_map(VP8_COMP *cpi) {
+ int mb_row, mb_col, num_bl;
+ VP8_COMMON *cm = &cpi->common;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const int src_uvstride = cpi->Source->uv_stride;
+
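+  // Use the finer 8x8 skin-detection blocks for CIF-sized frames and
+  // below; larger resolutions use 16x16 blocks.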
+ const SKIN_DETECTION_BLOCK_SIZE bsize =
+ (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16;
+ int offset = 0;
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
+ num_bl = 0;
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) {
+ const int bl_index = mb_row * cm->mb_cols + mb_col;
+ cpi->skin_map[offset] =
+ vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride,
+ bsize, cpi->consec_zero_last[bl_index], 0);
+ num_bl++;
+ offset++;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+ src_y += (src_ystride << 4) - (num_bl << 4);
+ src_u += (src_uvstride << 3) - (num_bl << 3);
+ src_v += (src_uvstride << 3) - (num_bl << 3);
+ }
+}
+
static void set_default_lf_deltas(VP8_COMP *cpi) {
cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
cpi->cyclic_refresh_map = (signed char *)NULL;
}
+ CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols,
+ sizeof(cpi->skin_map[0])));
+
CHECK_MEM_ERROR(cpi->consec_zero_last,
vpx_calloc(cm->mb_rows * cm->mb_cols, 1));
CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias,
yuv_denoised_file = fopen("denoised.yuv", "ab");
#endif
#ifdef OUTPUT_YUV_SKINMAP
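+  // Open with "wb" to truncate, so each encode produces a fresh dump
+  // instead of appending to an old one.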
- yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+ yuv_skinmap_file = fopen("skinmap.yuv", "wb");
#endif
#if 0
dealloc_compressor_data(cpi);
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
+ vpx_free(cpi->skin_map);
vpx_free(cpi->cyclic_refresh_map);
vpx_free(cpi->consec_zero_last);
vpx_free(cpi->consec_zero_last_mvbias);
}
#endif
+ compute_skin_map(cpi);
+
/* Setup background Q adjustment for error resilient mode.
* For multi-layer encodes only enable this for the base layer.
*/
int zeromv_count;
int lf_zeromv_pct;
+ unsigned char *skin_map;
+
unsigned char *segmentation_map;
signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
int segment_encode_breakout[MAX_MB_SEGMENTS];
x->is_skin = 0;
if (!cpi->oxcf.screen_content_mode) {
int block_index = mb_row * cpi->common.mb_cols + mb_col;
- x->is_skin = vp8_compute_skin_block(
- x->src.y_buffer, x->src.u_buffer, x->src.v_buffer, x->src.y_stride,
- x->src.uv_stride, SKIN_16X16, cpi->consec_zero_last[block_index], 0);
+ x->is_skin = cpi->skin_map[block_index];
}
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity) {
#include "./vpx_scale_rtcd.h"
#include "vp8/common/onyxc_int.h"
#include "onyx_int.h"
+#include "vp8/encoder/picklpf.h"
#include "vp8/encoder/quantize.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/vpx_scale.h"
--- /dev/null
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_PICKLPF_H_
+#define VP8_ENCODER_PICKLPF_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+struct yv12_buffer_config;
+
+void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd,
+ struct VP8_COMP *cpi);
+void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val);
+void vp8cx_pick_filter_level(struct yv12_buffer_config *sd,
+                             struct VP8_COMP *cpi);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VP8_ENCODER_PICKLPF_H_
#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "./vpx_dsp_rtcd.h"
+#include "encodeframe.h"
#include "tokenize.h"
#include "treewriter.h"
#include "onyx_int.h"
#define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D))
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+void vp8_auto_select_speed(VP8_COMP *cpi);
+
static INLINE void insertsortmv(int arr[], int len) {
int i, j, k;
#include "ratectrl.h"
#include "vp8/common/quant_common.h"
#include "segmentation.h"
+#include "temporal_filter.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/common/swapyv12buffer.h"
#include "vp8/common/threading.h"
--- /dev/null
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_
+#define VP8_ENCODER_TEMPORAL_FILTER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // VP8_ENCODER_TEMPORAL_FILTER_H_
+++ /dev/null
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
-; short *qcoeff_ptr,short *dequant_ptr,
-; short *scan_mask, short *round_ptr,
-; short *quant_ptr, short *dqcoeff_ptr);
-global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
-sym(vp8_fast_quantize_b_impl_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- movq mm0, [rsi]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm1, [rax]
-
- movq mm3, mm0
- psraw mm0, 15
-
- pxor mm3, mm0
- psubw mm3, mm0 ; abs
-
- movq mm2, mm3
- pcmpgtw mm1, mm2
-
- pandn mm1, mm2
- movq mm3, mm1
-
- mov rdx, arg(6) ;quant_ptr
- movq mm1, [rdx]
-
- mov rcx, arg(5) ;round_ptr
- movq mm2, [rcx]
-
- paddw mm3, mm2
- pmulhuw mm3, mm1
-
- pxor mm3, mm0
- psubw mm3, mm0 ;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
- movq mm0, mm3
-
- movq [rdi], mm3
-
- mov rax, arg(3) ;dequant_ptr
- movq mm2, [rax]
-
- pmullw mm3, mm2
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax], mm3
-
- ; next 8
- movq mm4, [rsi+8]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+8]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+8]
- movq mm6, [rcx+8]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+8], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+8]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+8], mm7
-
-
- ; next 8
- movq mm4, [rsi+16]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+16]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+16]
- movq mm6, [rcx+16]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+16], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+16]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+16], mm7
-
-
- ; next 8
- movq mm4, [rsi+24]
-
- mov rax, arg(1) ;zbin_ptr
- movq mm5, [rax+24]
-
- movq mm7, mm4
- psraw mm4, 15
-
- pxor mm7, mm4
- psubw mm7, mm4 ; abs
-
- movq mm6, mm7
- pcmpgtw mm5, mm6
-
- pandn mm5, mm6
- movq mm7, mm5
-
- movq mm5, [rdx+24]
- movq mm6, [rcx+24]
-
- paddw mm7, mm6
- pmulhuw mm7, mm5
-
- pxor mm7, mm4
- psubw mm7, mm4;gain the sign back
-
- mov rdi, arg(2) ;qcoeff_ptr
-
- movq mm1, mm7
- movq [rdi+24], mm7
-
- mov rax, arg(3) ;dequant_ptr
- movq mm6, [rax+24]
-
- pmullw mm7, mm6
- mov rax, arg(7) ;dqcoeff_ptr
-
- movq [rax+24], mm7
-
-
-
- mov rdi, arg(4) ;scan_mask
- mov rsi, arg(2) ;qcoeff_ptr
-
- pxor mm5, mm5
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rsi+8]
-
- movq mm2, [rdi]
- movq mm3, [rdi+8];
-
- pcmpeqw mm0, mm7
- pcmpeqw mm1, mm7
-
- pcmpeqw mm6, mm6
- pxor mm0, mm6
-
- pxor mm1, mm6
- psrlw mm0, 15
-
- psrlw mm1, 15
- pmaddwd mm0, mm2
-
- pmaddwd mm1, mm3
- movq mm5, mm0
-
- paddd mm5, mm1
-
- movq mm0, [rsi+16]
- movq mm1, [rsi+24]
-
- movq mm2, [rdi+16]
- movq mm3, [rdi+24];
-
- pcmpeqw mm0, mm7
- pcmpeqw mm1, mm7
-
- pcmpeqw mm6, mm6
- pxor mm0, mm6
-
- pxor mm1, mm6
- psrlw mm0, 15
-
- psrlw mm1, 15
- pmaddwd mm0, mm2
-
- pmaddwd mm1, mm3
- paddd mm5, mm0
-
- paddd mm5, mm1
- movq mm0, mm5
-
- psrlq mm5, 32
- paddd mm0, mm5
-
- ; eob adjustment begins here
- movq rcx, mm0
- and rcx, 0xffff
-
- xor rdx, rdx
- sub rdx, rcx ; rdx=-rcx
-
- bsr rax, rcx
- inc rax
-
- sar rdx, 31
- and rax, rdx
- ; Substitute the sse assembly for the old mmx mixed assembly/C. The
- ; following is kept as reference
- ; movq rcx, mm0
- ; bsr rax, rcx
- ;
- ; mov eob, rax
- ; mov eee, rcx
- ;
- ;if(eee==0)
- ;{
- ; eob=-1;
- ;}
- ;else if(eee<0)
- ;{
- ; eob=15;
- ;}
- ;d->eob = eob+1;
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
#include <tmmintrin.h> /* SSSE3 */
+#include "./vp8_rtcd.h"
#include "vp8/encoder/block.h"
/* bitscan reverse (bsr) */
+++ /dev/null
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vpx_ports/x86.h"
-#include "vp8/encoder/block.h"
-
-int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
- short *qcoeff_ptr, short *dequant_ptr,
- const short *scan_mask, short *round_ptr,
- short *quant_ptr, short *dqcoeff_ptr);
-void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) {
- const short *scan_mask = vp8_default_zig_zag_mask;
- short *coeff_ptr = b->coeff;
- short *zbin_ptr = b->zbin;
- short *round_ptr = b->round;
- short *quant_ptr = b->quant_fast;
- short *qcoeff_ptr = d->qcoeff;
- short *dqcoeff_ptr = d->dqcoeff;
- short *dequant_ptr = d->dequant;
-
- *d->eob = (char)vp8_fast_quantize_b_impl_mmx(
- coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask,
-
- round_ptr, quant_ptr, dqcoeff_ptr);
-}
}
static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data,
- unsigned int data_sz, vpx_codec_err_t *res) {
+ unsigned int data_sz,
+ volatile vpx_codec_err_t *res) {
*res = VPX_CODEC_OK;
if (ctx->fragments.count == 0) {
static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
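+  // res must be volatile: decoder errors can unwind via longjmp, which
+  // would otherwise leave the value of a plain local indeterminate.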
- vpx_codec_err_t res = VPX_CODEC_OK;
+ volatile vpx_codec_err_t res;
unsigned int resolution_change = 0;
unsigned int w, h;
}
}
-extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame);
static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx,
va_list args) {
int *ref_info = va_arg(args, int *);
VP8_CX_SRCS-yes += encoder/encodemb.c
VP8_CX_SRCS-yes += encoder/encodemv.c
VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c
+VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h
VP8_CX_SRCS-yes += encoder/firstpass.c
VP8_CX_SRCS-yes += encoder/block.h
VP8_CX_SRCS-yes += encoder/boolhuff.h
VP8_CX_SRCS-yes += encoder/onyx_if.c
VP8_CX_SRCS-yes += encoder/pickinter.c
VP8_CX_SRCS-yes += encoder/picklpf.c
+VP8_CX_SRCS-yes += encoder/picklpf.h
VP8_CX_SRCS-yes += encoder/vp8_quantize.c
VP8_CX_SRCS-yes += encoder/ratectrl.c
VP8_CX_SRCS-yes += encoder/rdopt.c
VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.h
VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c
VP8_CX_SRCS-yes += encoder/temporal_filter.c
+VP8_CX_SRCS-yes += encoder/temporal_filter.h
VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c
VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h
ifeq ($(CONFIG_REALTIME_ONLY),yes)
VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c
VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c
+VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h
endif
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
ifeq ($(CONFIG_REALTIME_ONLY),yes)
__m128i in[2];
const __m128i eight = _mm_set1_epi16(8);
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
switch (tx_type) {
case 0: // DCT_DCT
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
// load input data
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
- in[4] = load_input_data(input + 8 * 4);
- in[5] = load_input_data(input + 8 * 5);
- in[6] = load_input_data(input + 8 * 6);
- in[7] = load_input_data(input + 8 * 7);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8 * 1);
+ in[2] = load_input_data8(input + 8 * 2);
+ in[3] = load_input_data8(input + 8 * 3);
+ in[4] = load_input_data8(input + 8 * 4);
+ in[5] = load_input_data8(input + 8 * 5);
+ in[6] = load_input_data8(input + 8 * 6);
+ in[7] = load_input_data8(input + 8 * 7);
switch (tx_type) {
case 0: // DCT_DCT
int dummy;
};
-struct ALT_REF_AQ *vp9_alt_ref_aq_create() {
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) {
return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ));
}
*
* \return Instance of the class
*/
-struct ALT_REF_AQ *vp9_alt_ref_aq_create();
+struct ALT_REF_AQ *vp9_alt_ref_aq_create(void);
/*!\brief Upload segmentation_map to self object
*
int target_refresh = 0;
double weight_segment_target = 0;
double weight_segment = 0;
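+  // For HD and above, keep cyclic refresh enabled down to a much lower
+  // average low-motion level than for smaller resolutions.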
+ int thresh_low_motion = (cm->width < 720) ? 55 : 20;
cr->apply_cyclic_refresh = 1;
if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 ||
- (!cpi->use_svc && rc->avg_frame_low_motion < 55 &&
+ (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion &&
rc->frames_since_key > 40)) {
cr->apply_cyclic_refresh = 0;
return;
return 0;
}
-int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width,
- int height, int content_state) {
+static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed,
+ int width, int height,
+ int content_state) {
if (speed >= 8) {
if (width <= 640 && height <= 480)
return (5 * threshold_base) >> 2;
if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000)
x->content_state_sb = kLowVarHighSumdiff;
+ if (tmp_sad > (avg_source_sad_threshold << 1))
+ x->content_state_sb = kVeryHighSad;
+
if (cpi->content_state_sb_fd != NULL) {
if (tmp_sad < avg_source_sad_threshold2) {
// Cap the increment to 255.
set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]);
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
- x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
+ if (cpi->use_skin_detection)
+ x->sb_is_skin =
+ skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split);
d = xd->plane[0].dst.buf;
dp = xd->plane[0].dst.stride;
{ 10, 6 }, { 8, 5 },
};
-#define USE_GREEDY_OPTIMIZE_B 0
+#define USE_GREEDY_OPTIMIZE_B 1
#if USE_GREEDY_OPTIMIZE_B
vp9_update_noise_estimate(cpi);
- // Scene detection is used for VBR mode or screen-content case.
- // Make sure compute_source_sad_onepass is set (which handles SVC case
- // and dynamic resize).
+ // Scene detection is always used for VBR mode or screen-content case.
+ // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now
+ // (need to check encoding time cost for doing this for speed 8).
if (cpi->compute_source_sad_onepass &&
(cpi->oxcf.rc_mode == VPX_VBR ||
- cpi->oxcf.content == VP9E_CONTENT_SCREEN))
+ cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
+ (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8)))
vp9_scene_detection_onepass(cpi);
// For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame
kHighSadLowSumdiff = 3,
kHighSadHighSumdiff = 4,
kLowVarHighSumdiff = 5,
+ kVeryHighSad = 6,
} CONTENT_STATE_SB;
typedef struct VP9EncoderConfig {
if (cpi->oxcf.speed >= 8 && !cpi->use_svc &&
((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content ||
- x->last_sb_high_content > 40))
+ x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120))
usable_ref_frame = LAST_FRAME;
for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
continue;
}
- if ((cpi->sf.short_circuit_low_temp_var >= 2 ||
+ if (x->content_state_sb != kVeryHighSad &&
+ (cpi->sf.short_circuit_low_temp_var >= 2 ||
(cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) &&
force_skip_low_temp_var && ref_frame == LAST_FRAME &&
this_mode == NEWMV) {
const int bpm =
(int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth));
return VPXMAX(FRAME_OVERHEAD_BITS,
- (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS);
+ (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS));
}
int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) {
rc->af_ratio_onepass_vbr = 10;
rc->prev_avg_source_sad_lag = 0;
rc->high_source_sad = 0;
+ rc->reset_high_source_sad = 0;
rc->high_source_sad_lagindex = -1;
rc->alt_ref_gf_group = 0;
rc->fac_active_worst_inter = 150;
// In CBR mode, this makes sure q is between oscillating Qs to prevent
// resonance.
- if (cpi->oxcf.rc_mode == VPX_CBR &&
+ if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad &&
(!cpi->oxcf.gf_cbr_boost_pct ||
!(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) &&
(cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) &&
int active_worst_quality;
int ambient_qp;
unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers;
- if (cm->frame_type == KEY_FRAME) return rc->worst_quality;
+ if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad)
+ return rc->worst_quality;
// For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME]
// for the first few frames following key frame. These are both initialized
// to worst_quality and updated with (3/4, 1/4) average in postencode_update.
if (oxcf->pass == 0) {
if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi);
}
+ if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0;
}
void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
return resize_action;
}
-void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) {
+static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
+ uint64_t avg_sad_current) {
VP9_COMMON *const cm = &cpi->common;
RATE_CONTROL *const rc = &cpi->rc;
int target;
}
}
}
+ // For CBR non-screen content mode, check if we should reset the rate
+ // control. Reset is done if high_source_sad is detected and the rate
+ // control is at very low QP with rate correction factor at min level.
+ if (cpi->oxcf.rc_mode == VPX_CBR &&
+ cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) {
+ if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality &&
+ rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) &&
+ rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) {
+ rc->rate_correction_factors[INTER_NORMAL] = 0.5;
+ rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality;
+ rc->buffer_level = rc->optimal_buffer_level;
+ rc->bits_off_target = rc->optimal_buffer_level;
+ rc->reset_high_source_sad = 1;
+ }
+ if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad)
+ rc->this_frame_target = rc->avg_frame_bandwidth;
+ }
// For VBR, under scene change/high content change, force golden refresh.
if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME &&
rc->high_source_sad && rc->frames_to_key > 3 &&
int avg_frame_low_motion;
int af_ratio_onepass_vbr;
int force_qpmin;
+ int reset_high_source_sad;
} RATE_CONTROL;
struct VP9_COMP;
#include <assert.h>
#include <smmintrin.h>
+#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/postproc.h"
#include "vpx_ports/mem.h"
void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp,
--- /dev/null
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+// Most gcc 4.9 distributions outside of Android do not generate correct code
+// for this function.
+#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \
+ __GNUC__ == 4 && __GNUC_MINOR__ <= 9
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+ vpx_fdct32x32_c(input, output, stride);
+}
+
+#else
+
+#define LOAD_INCREMENT(src, stride, dest, index) \
+ do { \
+ dest[index] = vld1q_s16(src); \
+ src += stride; \
+ } while (0)
+
+#define ADD_S16(src, index0, index1, dest, index3) \
+ do { \
+ dest[index3] = vaddq_s16(src[index0], src[index1]); \
+ } while (0)
+
+#define ADD_SHIFT_S16(src, index0, index1) \
+ do { \
+ src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
+ } while (0)
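+// Note: despite the name, ADD_SHIFT_S16 stores the difference
+// (src[index0] - src[index1]), scaled by 4, back into src[index1].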
+
+// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
+// middle 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
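+// The shifts by 2 below implement the transform's first-pass scaling of
+// the input by 4, matching the C reference implementation.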
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
+ const int16_t *a_end = a + 24 * stride;
+ int16x8_t c[8];
+
+ LOAD_INCREMENT(a, stride, b, 0);
+ LOAD_INCREMENT(a, stride, b, 1);
+ LOAD_INCREMENT(a, stride, b, 2);
+ LOAD_INCREMENT(a, stride, b, 3);
+ LOAD_INCREMENT(a, stride, b, 4);
+ LOAD_INCREMENT(a, stride, b, 5);
+ LOAD_INCREMENT(a, stride, b, 6);
+ LOAD_INCREMENT(a, stride, b, 7);
+
+ LOAD_INCREMENT(a_end, stride, b, 24);
+ LOAD_INCREMENT(a_end, stride, b, 25);
+ LOAD_INCREMENT(a_end, stride, b, 26);
+ LOAD_INCREMENT(a_end, stride, b, 27);
+ LOAD_INCREMENT(a_end, stride, b, 28);
+ LOAD_INCREMENT(a_end, stride, b, 29);
+ LOAD_INCREMENT(a_end, stride, b, 30);
+ LOAD_INCREMENT(a_end, stride, b, 31);
+
+ ADD_S16(b, 0, 31, c, 0);
+ ADD_S16(b, 1, 30, c, 1);
+ ADD_S16(b, 2, 29, c, 2);
+ ADD_S16(b, 3, 28, c, 3);
+ ADD_S16(b, 4, 27, c, 4);
+ ADD_S16(b, 5, 26, c, 5);
+ ADD_S16(b, 6, 25, c, 6);
+ ADD_S16(b, 7, 24, c, 7);
+
+ ADD_SHIFT_S16(b, 7, 24);
+ ADD_SHIFT_S16(b, 6, 25);
+ ADD_SHIFT_S16(b, 5, 26);
+ ADD_SHIFT_S16(b, 4, 27);
+ ADD_SHIFT_S16(b, 3, 28);
+ ADD_SHIFT_S16(b, 2, 29);
+ ADD_SHIFT_S16(b, 1, 30);
+ ADD_SHIFT_S16(b, 0, 31);
+
+ b[0] = vshlq_n_s16(c[0], 2);
+ b[1] = vshlq_n_s16(c[1], 2);
+ b[2] = vshlq_n_s16(c[2], 2);
+ b[3] = vshlq_n_s16(c[3], 2);
+ b[4] = vshlq_n_s16(c[4], 2);
+ b[5] = vshlq_n_s16(c[5], 2);
+ b[6] = vshlq_n_s16(c[6], 2);
+ b[7] = vshlq_n_s16(c[7], 2);
+
+ LOAD_INCREMENT(a, stride, b, 8);
+ LOAD_INCREMENT(a, stride, b, 9);
+ LOAD_INCREMENT(a, stride, b, 10);
+ LOAD_INCREMENT(a, stride, b, 11);
+ LOAD_INCREMENT(a, stride, b, 12);
+ LOAD_INCREMENT(a, stride, b, 13);
+ LOAD_INCREMENT(a, stride, b, 14);
+ LOAD_INCREMENT(a, stride, b, 15);
+ LOAD_INCREMENT(a, stride, b, 16);
+ LOAD_INCREMENT(a, stride, b, 17);
+ LOAD_INCREMENT(a, stride, b, 18);
+ LOAD_INCREMENT(a, stride, b, 19);
+ LOAD_INCREMENT(a, stride, b, 20);
+ LOAD_INCREMENT(a, stride, b, 21);
+ LOAD_INCREMENT(a, stride, b, 22);
+ LOAD_INCREMENT(a, stride, b, 23);
+
+ ADD_S16(b, 8, 23, c, 0);
+ ADD_S16(b, 9, 22, c, 1);
+ ADD_S16(b, 10, 21, c, 2);
+ ADD_S16(b, 11, 20, c, 3);
+ ADD_S16(b, 12, 19, c, 4);
+ ADD_S16(b, 13, 18, c, 5);
+ ADD_S16(b, 14, 17, c, 6);
+ ADD_S16(b, 15, 16, c, 7);
+
+ ADD_SHIFT_S16(b, 15, 16);
+ ADD_SHIFT_S16(b, 14, 17);
+ ADD_SHIFT_S16(b, 13, 18);
+ ADD_SHIFT_S16(b, 12, 19);
+ ADD_SHIFT_S16(b, 11, 20);
+ ADD_SHIFT_S16(b, 10, 21);
+ ADD_SHIFT_S16(b, 9, 22);
+ ADD_SHIFT_S16(b, 8, 23);
+
+ b[8] = vshlq_n_s16(c[0], 2);
+ b[9] = vshlq_n_s16(c[1], 2);
+ b[10] = vshlq_n_s16(c[2], 2);
+ b[11] = vshlq_n_s16(c[3], 2);
+ b[12] = vshlq_n_s16(c[4], 2);
+ b[13] = vshlq_n_s16(c[5], 2);
+ b[14] = vshlq_n_s16(c[6], 2);
+ b[15] = vshlq_n_s16(c[7], 2);
+}
+
+#undef LOAD_INCREMENT
+#undef ADD_S16
+#undef ADD_SHIFT_S16
+
+#define STORE_S16(src, index, dest) \
+ do { \
+ store_s16q_to_tran_low(dest, src[index]); \
+ dest += 8; \
+  } while (0)
+
+// Store 32 16x8 values, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ STORE_S16(b, 0, a);
+ STORE_S16(b, 8, a);
+ STORE_S16(b, 16, a);
+ STORE_S16(b, 24, a);
+ STORE_S16(b, 1, a);
+ STORE_S16(b, 9, a);
+ STORE_S16(b, 17, a);
+ STORE_S16(b, 25, a);
+ STORE_S16(b, 2, a);
+ STORE_S16(b, 10, a);
+ STORE_S16(b, 18, a);
+ STORE_S16(b, 26, a);
+ STORE_S16(b, 3, a);
+ STORE_S16(b, 11, a);
+ STORE_S16(b, 19, a);
+ STORE_S16(b, 27, a);
+ STORE_S16(b, 4, a);
+ STORE_S16(b, 12, a);
+ STORE_S16(b, 20, a);
+ STORE_S16(b, 28, a);
+ STORE_S16(b, 5, a);
+ STORE_S16(b, 13, a);
+ STORE_S16(b, 21, a);
+ STORE_S16(b, 29, a);
+ STORE_S16(b, 6, a);
+ STORE_S16(b, 14, a);
+ STORE_S16(b, 22, a);
+ STORE_S16(b, 30, a);
+ STORE_S16(b, 7, a);
+ STORE_S16(b, 15, a);
+ STORE_S16(b, 23, a);
+ STORE_S16(b, 31, a);
+}
+
+#undef STORE_S16
+
+// fdct_round_shift((a +/- b) * c)
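+// vqrshrn_n_s32 narrows with rounding, matching the C helper's
+// (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS, with saturation added.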
+static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_high_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// *add = fdct_round_shift(a * c1 + b * c0)
+// *sub = fdct_round_shift(a * c0 - b * c1)
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_high_t constant0,
+ const tran_high_t constant1,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
+ const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
+ const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
+ const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
+ const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// Add 2 if positive, 1 otherwise (zero included), and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
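+// For example, 5 -> (5 + 2) >> 2 = 1 and -5 -> (-5 + 1) >> 2 = -1.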
+static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
+
+static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+  // Mini cross: butterfly the first 16 values, and apply the single-coeff
+  // butterfly (cospi_16_64) to the middle 8 of the second half.
+ a[0] = vaddq_s16(in[0], in[15]);
+ a[1] = vaddq_s16(in[1], in[14]);
+ a[2] = vaddq_s16(in[2], in[13]);
+ a[3] = vaddq_s16(in[3], in[12]);
+ a[4] = vaddq_s16(in[4], in[11]);
+ a[5] = vaddq_s16(in[5], in[10]);
+ a[6] = vaddq_s16(in[6], in[9]);
+ a[7] = vaddq_s16(in[7], in[8]);
+
+ a[8] = vsubq_s16(in[7], in[8]);
+ a[9] = vsubq_s16(in[6], in[9]);
+ a[10] = vsubq_s16(in[5], in[10]);
+ a[11] = vsubq_s16(in[4], in[11]);
+ a[12] = vsubq_s16(in[3], in[12]);
+ a[13] = vsubq_s16(in[2], in[13]);
+ a[14] = vsubq_s16(in[1], in[14]);
+ a[15] = vsubq_s16(in[0], in[15]);
+
+ a[16] = in[16];
+ a[17] = in[17];
+ a[18] = in[18];
+ a[19] = in[19];
+
+ butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
+ butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
+ butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
+ butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
+
+ a[28] = in[28];
+ a[29] = in[29];
+ a[30] = in[30];
+ a[31] = in[31];
+
+ // Stage 3.
+ b[0] = vaddq_s16(a[0], a[7]);
+ b[1] = vaddq_s16(a[1], a[6]);
+ b[2] = vaddq_s16(a[2], a[5]);
+ b[3] = vaddq_s16(a[3], a[4]);
+
+ b[4] = vsubq_s16(a[3], a[4]);
+ b[5] = vsubq_s16(a[2], a[5]);
+ b[6] = vsubq_s16(a[1], a[6]);
+ b[7] = vsubq_s16(a[0], a[7]);
+
+ b[8] = a[8];
+ b[9] = a[9];
+
+ butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+ butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+ b[14] = a[14];
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(in[16], a[23]);
+ b[17] = vaddq_s16(in[17], a[22]);
+ b[18] = vaddq_s16(in[18], a[21]);
+ b[19] = vaddq_s16(in[19], a[20]);
+
+ b[20] = vsubq_s16(in[19], a[20]);
+ b[21] = vsubq_s16(in[18], a[21]);
+ b[22] = vsubq_s16(in[17], a[22]);
+ b[23] = vsubq_s16(in[16], a[23]);
+
+ b[24] = vsubq_s16(in[31], a[24]);
+ b[25] = vsubq_s16(in[30], a[25]);
+ b[26] = vsubq_s16(in[29], a[26]);
+ b[27] = vsubq_s16(in[28], a[27]);
+
+ b[28] = vaddq_s16(in[28], a[27]);
+ b[29] = vaddq_s16(in[29], a[26]);
+ b[30] = vaddq_s16(in[30], a[25]);
+ b[31] = vaddq_s16(in[31], a[24]);
+
+ // Stage 4.
+ a[0] = vaddq_s16(b[0], b[3]);
+ a[1] = vaddq_s16(b[1], b[2]);
+ a[2] = vsubq_s16(b[1], b[2]);
+ a[3] = vsubq_s16(b[0], b[3]);
+
+ a[4] = b[4];
+
+ butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+ a[7] = b[7];
+
+ a[8] = vaddq_s16(b[8], b[11]);
+ a[9] = vaddq_s16(b[9], b[10]);
+ a[10] = vsubq_s16(b[9], b[10]);
+ a[11] = vsubq_s16(b[8], b[11]);
+ a[12] = vsubq_s16(b[15], b[12]);
+ a[13] = vsubq_s16(b[14], b[13]);
+ a[14] = vaddq_s16(b[14], b[13]);
+ a[15] = vaddq_s16(b[15], b[12]);
+
+ a[16] = b[16];
+ a[17] = b[17];
+
+ butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
+ butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
+ butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
+ butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
+
+ a[22] = b[22];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[25] = b[25];
+
+ a[30] = b[30];
+ a[31] = b[31];
+
+ // Stage 5.
+ butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+ butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
+
+ b[4] = vaddq_s16(a[4], a[5]);
+ b[5] = vsubq_s16(a[4], a[5]);
+ b[6] = vsubq_s16(a[7], a[6]);
+ b[7] = vaddq_s16(a[7], a[6]);
+
+ b[8] = a[8];
+
+ butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
+ butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
+
+ b[11] = a[11];
+ b[12] = a[12];
+
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(a[19], a[16]);
+ b[17] = vaddq_s16(a[18], a[17]);
+ b[18] = vsubq_s16(a[17], a[18]);
+ b[19] = vsubq_s16(a[16], a[19]);
+ b[20] = vsubq_s16(a[23], a[20]);
+ b[21] = vsubq_s16(a[22], a[21]);
+ b[22] = vaddq_s16(a[21], a[22]);
+ b[23] = vaddq_s16(a[20], a[23]);
+ b[24] = vaddq_s16(a[27], a[24]);
+ b[25] = vaddq_s16(a[26], a[25]);
+ b[26] = vsubq_s16(a[25], a[26]);
+ b[27] = vsubq_s16(a[24], a[27]);
+ b[28] = vsubq_s16(a[31], a[28]);
+ b[29] = vsubq_s16(a[30], a[29]);
+ b[30] = vaddq_s16(a[29], a[30]);
+ b[31] = vaddq_s16(a[28], a[31]);
+
+ // Stage 6.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+ butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
+ butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
+
+ a[8] = vaddq_s16(b[8], b[9]);
+ a[9] = vsubq_s16(b[8], b[9]);
+ a[10] = vsubq_s16(b[11], b[10]);
+ a[11] = vaddq_s16(b[11], b[10]);
+ a[12] = vaddq_s16(b[12], b[13]);
+ a[13] = vsubq_s16(b[12], b[13]);
+ a[14] = vsubq_s16(b[15], b[14]);
+ a[15] = vaddq_s16(b[15], b[14]);
+
+ a[16] = b[16];
+ a[19] = b[19];
+ a[20] = b[20];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[27] = b[27];
+ a[28] = b[28];
+ a[31] = b[31];
+
+ butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
+ butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
+ butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
+
+ // Stage 7.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+ b[4] = a[4];
+ b[5] = a[5];
+ b[6] = a[6];
+ b[7] = a[7];
+
+ butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
+ butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
+ butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
+ butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
+
+ b[16] = vaddq_s16(a[16], a[17]);
+ b[17] = vsubq_s16(a[16], a[17]);
+ b[18] = vsubq_s16(a[19], a[18]);
+ b[19] = vaddq_s16(a[19], a[18]);
+ b[20] = vaddq_s16(a[20], a[21]);
+ b[21] = vsubq_s16(a[20], a[21]);
+ b[22] = vsubq_s16(a[23], a[22]);
+ b[23] = vaddq_s16(a[23], a[22]);
+ b[24] = vaddq_s16(a[24], a[25]);
+ b[25] = vsubq_s16(a[24], a[25]);
+ b[26] = vsubq_s16(a[27], a[26]);
+ b[27] = vaddq_s16(a[27], a[26]);
+ b[28] = vaddq_s16(a[28], a[29]);
+ b[29] = vsubq_s16(a[28], a[29]);
+ b[30] = vsubq_s16(a[31], a[30]);
+ b[31] = vaddq_s16(a[31], a[30]);
+
+ // Final stage.
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
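+  // For example, temp_out[j] = 5: (5 + 1 + 1) >> 2 = 1, and
+  // temp_out[j] = -5: (-5 + 1 + 0) >> 2 = -1.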
+ out[0] = sub_round_shift(b[0]);
+ out[16] = sub_round_shift(b[1]);
+ out[8] = sub_round_shift(b[2]);
+ out[24] = sub_round_shift(b[3]);
+ out[4] = sub_round_shift(b[4]);
+ out[20] = sub_round_shift(b[5]);
+ out[12] = sub_round_shift(b[6]);
+ out[28] = sub_round_shift(b[7]);
+ out[2] = sub_round_shift(b[8]);
+ out[18] = sub_round_shift(b[9]);
+ out[10] = sub_round_shift(b[10]);
+ out[26] = sub_round_shift(b[11]);
+ out[6] = sub_round_shift(b[12]);
+ out[22] = sub_round_shift(b[13]);
+ out[14] = sub_round_shift(b[14]);
+ out[30] = sub_round_shift(b[15]);
+
+ butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
+ out[1] = sub_round_shift(a[1]);
+ out[31] = sub_round_shift(a[31]);
+
+ butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
+ out[17] = sub_round_shift(a[17]);
+ out[15] = sub_round_shift(a[15]);
+
+ butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
+ out[9] = sub_round_shift(a[9]);
+ out[23] = sub_round_shift(a[23]);
+
+ butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
+ out[25] = sub_round_shift(a[25]);
+ out[7] = sub_round_shift(a[7]);
+
+ butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
+ out[5] = sub_round_shift(a[5]);
+ out[27] = sub_round_shift(a[27]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
+ out[21] = sub_round_shift(a[21]);
+ out[11] = sub_round_shift(a[11]);
+
+ butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
+ out[13] = sub_round_shift(a[13]);
+ out[19] = sub_round_shift(a[19]);
+
+ butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
+ out[29] = sub_round_shift(a[29]);
+ out[3] = sub_round_shift(a[3]);
+}
+
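+// The macros below operate on paired <name>_lo/<name>_hi arrays of int32x4_t
+// which together hold the 8 lanes of a widened int16x8_t vector. For example,
+// PASS_THROUGH(c, d, 3) copies both halves of element 3 of c into d.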
+#define PASS_THROUGH(src, dst, element) \
+ do { \
+ dst##_lo[element] = src##_lo[element]; \
+ dst##_hi[element] = src##_hi[element]; \
+ } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
+ do { \
+ c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
+ c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+ } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+ do { \
+ temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
+ temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
+ c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
+ c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
+ } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+// Like butterfly_one_coeff, but don't narrow results.
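+// That is, *add = ROUND_POWER_OF_TWO((a + b) * constant, DCT_CONST_BITS) and
+// *sub = ROUND_POWER_OF_TWO((a - b) * constant, DCT_CONST_BITS), returned as
+// int32x4_t lo/hi pairs.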
+static INLINE void butterfly_one_coeff_s16_s32(
+ const int16x8_t a, const int16x8_t b, const tran_high_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
+ add_index, sub_index) \
+ do { \
+ butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+ &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+// Like butterfly_one_coeff, but with s32.
+static INLINE void butterfly_one_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_high_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+  // TODO(johannkoenig): Strangely there is a conversion warning only for
+  // int64_t to int32_t (const tran_high_t (aka const long long)) but not for
+  // int64_t to int16_t. The constants fit in int16_t. Investigate using
+  // int16_t for the constants to avoid bouncing between types.
+ const int32_t constant_s32 = (int32_t)constant;
+ const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant_s32);
+ const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant_s32);
+ const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant_s32);
+ const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant_s32);
+ const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant_s32);
+ const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant_s32);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+ sub_index) \
+ do { \
+ butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ constant, &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+// Like butterfly_two_coeff, but with s32.
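+// That is, *add = ROUND_POWER_OF_TWO(a * constant1 + b * constant0,
+// DCT_CONST_BITS) and *sub = ROUND_POWER_OF_TWO(a * constant0 - b * constant1,
+// DCT_CONST_BITS).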
+static INLINE void butterfly_two_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_high_t constant0,
+ const tran_high_t constant1, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32_t constant0_s32 = (int32_t)constant0;
+ const int32_t constant1_s32 = (int32_t)constant1;
+ const int32x4_t a0 = vmulq_n_s32(a_lo, constant0_s32);
+ const int32x4_t a1 = vmulq_n_s32(a_hi, constant0_s32);
+ const int32x4_t a2 = vmulq_n_s32(a_lo, constant1_s32);
+ const int32x4_t a3 = vmulq_n_s32(a_hi, constant1_s32);
+ const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0_s32);
+ const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0_s32);
+ const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1_s32);
+ const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1_s32);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
+ right_constant, b, add_index, sub_index) \
+ do { \
+ butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ left_constant, right_constant, &b##_lo[add_index], \
+ &b##_hi[add_index], &b##_lo[sub_index], \
+ &b##_hi[sub_index]); \
+ } while (0)
+
+// Add 1 if non-negative, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
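+// For example, a = 5: (5 + 0 + 1) >> 2 = 1; a = -5: (-5 + 1 + 1) >> 2 = -1.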
+static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
+ const int32x4_t a_hi) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+ const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+ const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+ const int16x4_t b_lo =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+ const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+ const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+ const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+ const int16x4_t b_hi =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+ return vcombine_s16(b_lo, b_hi);
+}
+
+static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+ int32x4_t c_lo[32];
+ int32x4_t c_hi[32];
+ int32x4_t d_lo[32];
+ int32x4_t d_hi[32];
+
+  // Stage 1. (In the first pass this was done as part of the load.)
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+ b[18] = a[18];
+ b[19] = a[19];
+
+ butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+ b[28] = a[28];
+ b[29] = a[29];
+ b[30] = a[30];
+ b[31] = a[31];
+
+  // Stage 3. With extreme input values this calculation overflows int16_t.
+  // The sources of b[0] are added multiple times and, through testing, have
+  // been shown to overflow starting here.
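+  // (b[0] already holds a sum of four inputs; adding b[7] makes eight, which
+  // can exceed the +/-32767 range of int16_t, hence the widening
+  // vaddl_s16/vsubl_s16 from this point on.)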
+ ADD_S16_S32(b, 0, 7, c, 0);
+ ADD_S16_S32(b, 1, 6, c, 1);
+ ADD_S16_S32(b, 2, 5, c, 2);
+ ADD_S16_S32(b, 3, 4, c, 3);
+ SUB_S16_S32(b, 3, 4, c, 4);
+ SUB_S16_S32(b, 2, 5, c, 5);
+ SUB_S16_S32(b, 1, 6, c, 6);
+ SUB_S16_S32(b, 0, 7, c, 7);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+ BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ ADD_S16_S32(b, 16, 23, c, 16);
+ ADD_S16_S32(b, 17, 22, c, 17);
+ ADD_S16_S32(b, 18, 21, c, 18);
+ ADD_S16_S32(b, 19, 20, c, 19);
+ SUB_S16_S32(b, 19, 20, c, 20);
+ SUB_S16_S32(b, 18, 21, c, 21);
+ SUB_S16_S32(b, 17, 22, c, 22);
+ SUB_S16_S32(b, 16, 23, c, 23);
+ SUB_S16_S32(b, 31, 24, c, 24);
+ SUB_S16_S32(b, 30, 25, c, 25);
+ SUB_S16_S32(b, 29, 26, c, 26);
+ SUB_S16_S32(b, 28, 27, c, 27);
+ ADD_S16_S32(b, 28, 27, c, 28);
+ ADD_S16_S32(b, 29, 26, c, 29);
+ ADD_S16_S32(b, 30, 25, c, 30);
+ ADD_S16_S32(b, 31, 24, c, 31);
+
+ // Stage 4.
+ ADD_S32(c, 0, 3, d, 0);
+ ADD_S32(c, 1, 2, d, 1);
+ SUB_S32(c, 1, 2, d, 2);
+ SUB_S32(c, 0, 3, d, 3);
+
+ PASS_THROUGH(c, d, 4);
+
+ BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+ PASS_THROUGH(c, d, 7);
+
+ ADDW_S16_S32(c, 11, a, 8, d, 8);
+ ADDW_S16_S32(c, 10, a, 9, d, 9);
+ SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+ SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+ SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+ SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+ ADDW_S16_S32(c, 13, b, 14, d, 14);
+ ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 17);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
+ BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
+ BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
+
+ PASS_THROUGH(c, d, 22);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 25);
+
+ PASS_THROUGH(c, d, 30);
+ PASS_THROUGH(c, d, 31);
+
+ // Stage 5.
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+ BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
+
+ ADD_S32(d, 4, 5, c, 4);
+ SUB_S32(d, 4, 5, c, 5);
+ SUB_S32(d, 7, 6, c, 6);
+ ADD_S32(d, 7, 6, c, 7);
+
+ PASS_THROUGH(d, c, 8);
+
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
+ BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
+
+ PASS_THROUGH(d, c, 11);
+ PASS_THROUGH(d, c, 12);
+ PASS_THROUGH(d, c, 15);
+
+ ADD_S32(d, 16, 19, c, 16);
+ ADD_S32(d, 17, 18, c, 17);
+ SUB_S32(d, 17, 18, c, 18);
+ SUB_S32(d, 16, 19, c, 19);
+ SUB_S32(d, 23, 20, c, 20);
+ SUB_S32(d, 22, 21, c, 21);
+ ADD_S32(d, 22, 21, c, 22);
+ ADD_S32(d, 23, 20, c, 23);
+ ADD_S32(d, 24, 27, c, 24);
+ ADD_S32(d, 25, 26, c, 25);
+ SUB_S32(d, 25, 26, c, 26);
+ SUB_S32(d, 24, 27, c, 27);
+ SUB_S32(d, 31, 28, c, 28);
+ SUB_S32(d, 30, 29, c, 29);
+ ADD_S32(d, 30, 29, c, 30);
+ ADD_S32(d, 31, 28, c, 31);
+
+ // Stage 6.
+ PASS_THROUGH(c, d, 0);
+ PASS_THROUGH(c, d, 1);
+ PASS_THROUGH(c, d, 2);
+ PASS_THROUGH(c, d, 3);
+
+ BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
+ BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
+
+ ADD_S32(c, 8, 9, d, 8);
+ SUB_S32(c, 8, 9, d, 9);
+ SUB_S32(c, 11, 10, d, 10);
+ ADD_S32(c, 11, 10, d, 11);
+ ADD_S32(c, 12, 13, d, 12);
+ SUB_S32(c, 12, 13, d, 13);
+ SUB_S32(c, 15, 14, d, 14);
+ ADD_S32(c, 15, 14, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 19);
+ PASS_THROUGH(c, d, 20);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 27);
+ PASS_THROUGH(c, d, 28);
+ PASS_THROUGH(c, d, 31);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
+ BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
+ BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
+
+ // Stage 7.
+ PASS_THROUGH(d, c, 0);
+ PASS_THROUGH(d, c, 1);
+ PASS_THROUGH(d, c, 2);
+ PASS_THROUGH(d, c, 3);
+ PASS_THROUGH(d, c, 4);
+ PASS_THROUGH(d, c, 5);
+ PASS_THROUGH(d, c, 6);
+ PASS_THROUGH(d, c, 7);
+
+ BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
+ BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
+
+ ADD_S32(d, 16, 17, c, 16);
+ SUB_S32(d, 16, 17, c, 17);
+ SUB_S32(d, 19, 18, c, 18);
+ ADD_S32(d, 19, 18, c, 19);
+ ADD_S32(d, 20, 21, c, 20);
+ SUB_S32(d, 20, 21, c, 21);
+ SUB_S32(d, 23, 22, c, 22);
+ ADD_S32(d, 23, 22, c, 23);
+ ADD_S32(d, 24, 25, c, 24);
+ SUB_S32(d, 24, 25, c, 25);
+ SUB_S32(d, 27, 26, c, 26);
+ ADD_S32(d, 27, 26, c, 27);
+ ADD_S32(d, 28, 29, c, 28);
+ SUB_S32(d, 28, 29, c, 29);
+ SUB_S32(d, 31, 30, c, 30);
+ ADD_S32(d, 31, 30, c, 31);
+
+ // Final stage.
+  // Roll rounding into this function so we can pass back int16x8_t vectors.
+
+ out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
+ out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
+
+ out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
+ out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
+ out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
+ out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
+ out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
+
+ out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
+ out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
+ out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
+ out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
+
+ out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
+ out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
+ out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
+ out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
+ out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
+ out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
+ out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
+ out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
+ out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
+ out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
+ out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
+
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
+ out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
+ out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
+
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
+ out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
+ out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
+
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
+ out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
+ out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
+
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
+ out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
+ out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
+
+ BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
+ out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
+ out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+// Transpose 8x8 to a new location. Don't use transpose_neon.h because its
+// transposes are all in-place.
+// TODO(johannkoenig): share with other fdcts.
+static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
+ // Swap 16 bit elements.
+ const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements.
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+ const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+ vreinterpretq_s32_s16(c3.val[0]));
+ const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+ vreinterpretq_s32_s16(c3.val[1]));
+
+ // Swap 64 bit elements
+ const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+ const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+ const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+ const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+ b[0] = e0.val[0];
+ b[1] = e1.val[0];
+ b[2] = e2.val[0];
+ b[3] = e3.val[0];
+ b[4] = e0.val[1];
+ b[5] = e1.val[1];
+ b[6] = e2.val[1];
+ b[7] = e3.val[1];
+}
+
+void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
+ int16x8_t temp0[32];
+ int16x8_t temp1[32];
+ int16x8_t temp2[32];
+ int16x8_t temp3[32];
+ int16x8_t temp4[32];
+ int16x8_t temp5[32];
+
+ // Process in 8x32 columns.
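+  // The 32x32 input is handled as four 8x32 column slices: each is
+  // transformed here, then transposed into 8x32 row slices for the second
+  // pass.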
+ load(input, stride, temp0);
+ dct_body_first_pass(temp0, temp1);
+
+ load(input + 8, stride, temp0);
+ dct_body_first_pass(temp0, temp2);
+
+ load(input + 16, stride, temp0);
+ dct_body_first_pass(temp0, temp3);
+
+ load(input + 24, stride, temp0);
+ dct_body_first_pass(temp0, temp4);
+
+  // Generate the top row by transposing the first set of 8 from each column
+  // into place.
+ transpose_8x8(&temp1[0], &temp0[0]);
+ transpose_8x8(&temp2[0], &temp0[8]);
+ transpose_8x8(&temp3[0], &temp0[16]);
+ transpose_8x8(&temp4[0], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output, temp5);
+
+ // Second row of 8x32.
+ transpose_8x8(&temp1[8], &temp0[0]);
+ transpose_8x8(&temp2[8], &temp0[8]);
+ transpose_8x8(&temp3[8], &temp0[16]);
+ transpose_8x8(&temp4[8], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 8 * 32, temp5);
+
+ // Third row of 8x32
+ transpose_8x8(&temp1[16], &temp0[0]);
+ transpose_8x8(&temp2[16], &temp0[8]);
+ transpose_8x8(&temp3[16], &temp0[16]);
+ transpose_8x8(&temp4[16], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 16 * 32, temp5);
+
+ // Final row of 8x32.
+ transpose_8x8(&temp1[24], &temp0[0]);
+ transpose_8x8(&temp2[24], &temp0[8]);
+ transpose_8x8(&temp3[24], &temp0[16]);
+ transpose_8x8(&temp4[24], &temp0[24]);
+
+ dct_body_second_pass(temp0, temp5);
+
+ transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4],
+ &temp5[5], &temp5[6], &temp5[7]);
+ transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12],
+ &temp5[13], &temp5[14], &temp5[15]);
+ transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20],
+ &temp5[21], &temp5[22], &temp5[23]);
+ transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28],
+ &temp5[29], &temp5[30], &temp5[31]);
+ store(output + 24 * 32, temp5);
+}
+#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
+ // __GNUC__ == 4 && __GNUC_MINOR__ <= 9
*/
#include <assert.h>
#include <stdlib.h>
+#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
const int16_t vpx_rv[] = {
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
specialize qw/vpx_fdct16x16_1 sse2/;
add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct32x32 sse2/;
+ specialize qw/vpx_fdct32x32 neon sse2/;
add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct32x32_rd sse2/;
specialize qw/vpx_fdct16x16_1 sse2 msa/;
add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct32x32 sse2 avx2 msa/;
+ specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/;
add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/;
$vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
specialize qw/vpx_idct16x16_10_add neon sse2/;
specialize qw/vpx_idct16x16_1_add neon sse2/;
- specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/;
+ specialize qw/vpx_idct32x32_1024_add neon sse2/;
specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
- specialize qw/vpx_highbd_idct4x4_16_add neon sse2/;
+ specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/;
specialize qw/vpx_highbd_idct8x8_64_add neon sse2/;
specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
const __m128i in1) {
return dct_const_round_shift_sse2(t2);
}
-static INLINE __m128i wraplow_16bit_sse2(const __m128i in0, const __m128i in1,
- const __m128i rounding) {
- __m128i temp[2];
- temp[0] = _mm_add_epi32(in0, rounding);
- temp[1] = _mm_add_epi32(in1, rounding);
- temp[0] = _mm_srai_epi32(temp[0], 4);
- temp[1] = _mm_srai_epi32(temp[1], 4);
- return _mm_packs_epi32(temp[0], temp[1]);
-}
-
static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0);
const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0);
return _mm_sub_epi64(out, sign);
}
-static INLINE __m128i dct_const_round_shift_64bit_sse2(const __m128i in) {
- const __m128i t = _mm_add_epi64(
- in,
- _mm_setr_epi32(DCT_CONST_ROUNDING << 2, 0, DCT_CONST_ROUNDING << 2, 0));
- return _mm_srli_si128(t, 2);
-}
-
-static INLINE __m128i pack_4_sse2(const __m128i in0, const __m128i in1) {
- const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2
- const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3
- return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3
-}
-
static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
const __m128i cospi_p16_p16 =
_mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p16_p16);
temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p16_p16);
temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p16_p16);
- temp1[0] = dct_const_round_shift_64bit_sse2(temp1[0]);
- temp1[1] = dct_const_round_shift_64bit_sse2(temp1[1]);
- temp2[0] = dct_const_round_shift_64bit_sse2(temp2[0]);
- temp2[1] = dct_const_round_shift_64bit_sse2(temp2[1]);
- step[0] = pack_4_sse2(temp1[0], temp1[1]);
- step[1] = pack_4_sse2(temp2[0], temp2[1]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ step[0] = pack_4(temp1[0], temp1[1]);
+ step[1] = pack_4(temp2[0], temp2[1]);
abs_extend_64bit_sse2(io[1], temp1, sign1);
abs_extend_64bit_sse2(io[3], temp2, sign2);
temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); // [1]*cospi_24 - [3]*cospi_8
temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); // [1]*cospi_8 + [3]*cospi_24
temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); // [1]*cospi_8 + [3]*cospi_24
- temp1[0] = dct_const_round_shift_64bit_sse2(temp1[0]);
- temp1[1] = dct_const_round_shift_64bit_sse2(temp1[1]);
- temp2[0] = dct_const_round_shift_64bit_sse2(temp2[0]);
- temp2[1] = dct_const_round_shift_64bit_sse2(temp2[1]);
- step[2] = pack_4_sse2(temp1[0], temp1[1]);
- step[3] = pack_4_sse2(temp2[0], temp2[1]);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ step[2] = pack_4(temp1[0], temp1[1]);
+ step[3] = pack_4(temp2[0], temp2[1]);
// stage 2
io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
highbd_idct4_large_sse2(io);
highbd_idct4_large_sse2(io);
}
- io[0] = wraplow_16bit_sse2(io[0], io[1], _mm_set1_epi32(8));
- io[1] = wraplow_16bit_sse2(io[2], io[3], _mm_set1_epi32(8));
+ io[0] = wraplow_16bit(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8));
}
- // Reconstruction and Store
- {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi64(d0,
- _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd);
- // store input0
- _mm_storel_epi64((__m128i *)dest, d0);
- // store input1
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- // store input2
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- // store input3
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
- }
+ recon_and_store_4(dest, io, stride, bd);
}
void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
--- /dev/null
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+
+static INLINE void extend_64bit(const __m128i in,
+ __m128i *const out /*out[2]*/) {
+ out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1
+ out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3
+}
+
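+// Performs one 1-D 4-point inverse DCT on 32-bit coefficients. The transpose
+// at the top means two successive calls perform the row and then the column
+// pass of the 2-D transform.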
+static INLINE void highbd_idct4(__m128i *const io) {
+ const __m128i cospi_p16_p16 =
+ _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
+ const __m128i cospi_p08_p08 =
+ _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
+ const __m128i cospi_p24_p24 =
+ _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
+ __m128i temp1[4], temp2[4], step[4];
+
+ transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+
+ // stage 1
+ temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ extend_64bit(temp1[0], temp1);
+ extend_64bit(temp2[0], temp2);
+ temp1[0] = _mm_mul_epi32(temp1[0], cospi_p16_p16);
+ temp1[1] = _mm_mul_epi32(temp1[1], cospi_p16_p16);
+ temp2[0] = _mm_mul_epi32(temp2[0], cospi_p16_p16);
+ temp2[1] = _mm_mul_epi32(temp2[1], cospi_p16_p16);
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ step[0] = pack_4(temp1[0], temp1[1]);
+ step[1] = pack_4(temp2[0], temp2[1]);
+
+ extend_64bit(io[1], temp1);
+ extend_64bit(io[3], temp2);
+ temp1[2] = _mm_mul_epi32(temp1[0], cospi_p08_p08);
+ temp1[3] = _mm_mul_epi32(temp1[1], cospi_p08_p08);
+ temp1[0] = _mm_mul_epi32(temp1[0], cospi_p24_p24);
+ temp1[1] = _mm_mul_epi32(temp1[1], cospi_p24_p24);
+ temp2[2] = _mm_mul_epi32(temp2[0], cospi_p24_p24);
+ temp2[3] = _mm_mul_epi32(temp2[1], cospi_p24_p24);
+ temp2[0] = _mm_mul_epi32(temp2[0], cospi_p08_p08);
+ temp2[1] = _mm_mul_epi32(temp2[1], cospi_p08_p08);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); // [1]*cospi_24 - [3]*cospi_8
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); // [1]*cospi_24 - [3]*cospi_8
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); // [1]*cospi_8 + [3]*cospi_24
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); // [1]*cospi_8 + [3]*cospi_24
+ temp1[0] = dct_const_round_shift_64bit(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit(temp2[1]);
+ step[2] = pack_4(temp1[0], temp1[1]);
+ step[3] = pack_4(temp2[0], temp2[1]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
+void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ __m128i io[4];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+ if (bd == 8) {
+ __m128i io_short[2];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+ idct4_sse2(io_short);
+ idct4_sse2(io_short);
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
+ } else {
+ highbd_idct4(io);
+ highbd_idct4(io);
+ io[0] = wraplow_16bit(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit(io[2], io[3], _mm_set1_epi32(8));
+ }
+
+ recon_and_store_4(dest, io, stride, bd);
+}
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
+static INLINE __m128i wraplow_16bit(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 4);
+ temp[1] = _mm_srai_epi32(temp[1], 4);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
+
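+// The 64-bit inputs to this function are products of cospi constants that
+// were pre-scaled by 4 (<< 2), so the effective rounding shift is
+// DCT_CONST_BITS + 2 = 16 bits; the 2-byte vector shift leaves each result in
+// the low 32 bits of its 64-bit lane for pack_4() below to gather.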
+static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) {
+ const __m128i t = _mm_add_epi64(
+ in,
+ _mm_setr_epi32(DCT_CONST_ROUNDING << 2, 0, DCT_CONST_ROUNDING << 2, 0));
+ return _mm_srli_si128(t, 2);
+}
+
+static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3
+ return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3
+}
+
static INLINE __m128i add_dc_clamp(const __m128i *const min,
const __m128i *const max,
const __m128i *const dc,
return retval;
}
+static INLINE void recon_and_store_4(uint16_t *const dest,
+ const __m128i *const io, const int stride,
+ int bd) {
+ __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+ __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+ d0 =
+ _mm_unpacklo_epi64(d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
+ d2 = _mm_unpacklo_epi64(
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+ d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd);
+ d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd);
+ _mm_storel_epi64((__m128i *)dest, d0);
+ d0 = _mm_srli_si128(d0, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride), d0);
+ _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+ d2 = _mm_srli_si128(d2, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+}
+
#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_
__m128i in[2];
// Rows
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8);
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8);
idct4_sse2(in);
// Columns
*res3 = idct_calc_wraplow_sse2(lo_1, hi_1, *cst3);
}
-static void multiplication_and_add_2(const __m128i *const in0,
- const __m128i *const in1,
- const __m128i *const cst0,
- const __m128i *const cst1,
- __m128i *const res0, __m128i *const res1) {
- const __m128i lo = _mm_unpacklo_epi16(*in0, *in1);
- const __m128i hi = _mm_unpackhi_epi16(*in0, *in1);
- *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0);
- *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1);
-}
-
static INLINE void idct8(const __m128i *const in, __m128i *const out) {
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- /* Stage1 */
- multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &stg1_0, &stg1_1,
- &stg1_2, &stg1_3, &stp1_4, &stp1_7, &stp1_5, &stp1_6);
-
- /* Stage2 */
- multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &stg2_0, &stg2_1,
- &stg2_2, &stg2_3, &stp2_0, &stp2_1, &stp2_2, &stp2_3);
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
- /* Stage3 */
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
- multiplication_and_add_2(&stp2_6, &stp2_5, &stg2_1, &stg2_0, &stp1_5,
- &stp1_6);
-
- /* Stage4 */
- out[0] = _mm_add_epi16(stp1_0, stp2_7);
- out[1] = _mm_add_epi16(stp1_1, stp1_6);
- out[2] = _mm_add_epi16(stp1_2, stp1_5);
- out[3] = _mm_add_epi16(stp1_3, stp2_4);
- out[4] = _mm_sub_epi16(stp1_3, stp2_4);
- out[5] = _mm_sub_epi16(stp1_2, stp1_5);
- out[6] = _mm_sub_epi16(stp1_1, stp1_6);
- out[7] = _mm_sub_epi16(stp1_0, stp2_7);
+ const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ __m128i step1[8], step2[8];
+
+ // stage 1
+ {
+ const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ multiplication_and_add(&in[1], &in[7], &in[3], &in[5], &cp_28_n4, &cp_4_28,
+ &cp_n20_12, &cp_12_20, &step1[4], &step1[7],
+ &step1[5], &step1[6]);
+ }
+
+ // stage 2
+ {
+ const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ multiplication_and_add(&in[0], &in[4], &in[2], &in[6], &cp_16_16,
+ &cp_16_n16, &cp_24_n8, &cp_8_24, &step2[0],
+ &step2[1], &step2[2], &step2[3]);
+ }
+
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[1], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[1], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+ &step1[5], &step1[6]);
+
+ // stage 4
+ out[0] = _mm_add_epi16(step1[0], step2[7]);
+ out[1] = _mm_add_epi16(step1[1], step1[6]);
+ out[2] = _mm_add_epi16(step1[2], step1[5]);
+ out[3] = _mm_add_epi16(step1[3], step2[4]);
+ out[4] = _mm_sub_epi16(step1[3], step2[4]);
+ out[5] = _mm_sub_epi16(step1[2], step1[5]);
+ out[6] = _mm_sub_epi16(step1[1], step1[6]);
+ out[7] = _mm_sub_epi16(step1[0], step2[7]);
}
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
write_buffer_8x8(in, dest, stride);
}
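+// Add the same eight residuals to two rows of dest at once: the low half of
+// the packed result is stored to row 0 and the high half to row 1.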
+static INLINE void recon_and_store_8_dual(uint8_t *const dest,
+ const __m128i in_x,
+ const int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0, d1;
+
+ d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride));
+ d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride));
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d1 = _mm_unpacklo_epi8(d1, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d1 = _mm_add_epi16(in_x, d1);
+ d0 = _mm_packus_epi16(d0, d1);
+ _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0);
+ _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0));
+}
+
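+// For a DC-only block the 2-D IDCT reduces to a single value: input[0] is
+// scaled by cospi_16_64 once per pass, rounded, and broadcast to all 64
+// output pixels.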
void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value;
- int a;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 5);
-
- dc_value = _mm_set1_epi16(a);
-
- recon_and_store(dest + 0 * stride, dc_value);
- recon_and_store(dest + 1 * stride, dc_value);
- recon_and_store(dest + 2 * stride, dc_value);
- recon_and_store(dest + 3 * stride, dc_value);
- recon_and_store(dest + 4 * stride, dc_value);
- recon_and_store(dest + 5 * stride, dc_value);
- recon_and_store(dest + 6 * stride, dc_value);
- recon_and_store(dest + 7 * stride, dc_value);
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 5);
+ dc_value = _mm_set1_epi16(a1);
+
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
+ dest += 2 * stride;
+ recon_and_store_8_dual(dest, dc_value, stride);
}
void idct8_sse2(__m128i *in) {
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128();
- const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ __m128i in[8], step1[8], step2[8], tmp[4];
- __m128i in[8];
- __m128i stp1_2, stp1_3, stp1_4, stp1_5;
- __m128i stp2_0, stp2_2, stp2_4, stp2_5, stp2_6;
- __m128i tmp[4];
-
- // Rows. Load 4-row input data.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
+ in[0] = load_input_data4(input + 0 * 8);
+ in[1] = load_input_data4(input + 1 * 8);
+ in[2] = load_input_data4(input + 2 * 8);
+ in[3] = load_input_data4(input + 3 * 8);
- // 8x4 Transpose
transpose_16bit_4x4(in, in);
- // Stage1
- {
- const __m128i lo_17 = _mm_unpackhi_epi16(in[0], zero);
- const __m128i lo_35 = _mm_unpackhi_epi16(in[1], zero);
+ // in[0]: 00 10 20 30 01 11 21 31
+ // in[1]: 02 12 22 32 03 13 23 33
- stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17);
- stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35);
+ // stage 1
+ {
+ const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i lo_1 = _mm_unpackhi_epi16(in[0], zero);
+ const __m128i lo_3 = _mm_unpackhi_epi16(in[1], zero);
+ step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7
+ step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6
}
- // Stage2
+ // stage 2
{
- const __m128i lo_04 = _mm_unpacklo_epi16(in[0], zero);
- const __m128i lo_26 = _mm_unpacklo_epi16(in[1], zero);
-
- stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04);
- stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26);
-
- tmp[0] = _mm_add_epi16(stp1_4, stp1_5);
- tmp[1] = _mm_sub_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp[0];
- stp2_5 = _mm_unpacklo_epi64(tmp[1], zero);
- stp2_6 = _mm_unpackhi_epi64(tmp[1], zero);
+ const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i lo_0 = _mm_unpacklo_epi16(in[0], zero);
+ const __m128i lo_2 = _mm_unpacklo_epi16(in[1], zero);
+ step2[0] = idct_calc_wraplow_sse2(cp_16_16, cp_16_n16, lo_0); // step2 0&1
+ step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6
}
- // Stage3
+ // stage 3
{
- const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
-
- tmp[0] = _mm_add_epi16(stp2_0, stp2_2);
- tmp[1] = _mm_sub_epi16(stp2_0, stp2_2);
- stp1_2 = _mm_unpackhi_epi64(tmp[1], tmp[0]);
- stp1_3 = _mm_unpacklo_epi64(tmp[1], tmp[0]);
- stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56); // stg3_1 = stg2_0
+ const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]);
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6
}
- // Stage4
- tmp[0] = _mm_add_epi16(stp1_3, stp2_4);
- tmp[1] = _mm_add_epi16(stp1_2, stp1_5);
- tmp[2] = _mm_sub_epi16(stp1_3, stp2_4);
- tmp[3] = _mm_sub_epi16(stp1_2, stp1_5);
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
idct8x8_12_transpose_16bit_4x8(tmp, in);
in[4] = in[5] = in[6] = in[7] = zero;
write_buffer_8x8(in, dest, stride);
}
-#define IDCT16 \
- /* Stage2 */ \
- multiplication_and_add(&in[1], &in[15], &in[9], &in[7], &stg2_0, &stg2_1, \
- &stg2_2, &stg2_3, &stp2_8, &stp2_15, &stp2_9, \
- &stp2_14); \
- \
- multiplication_and_add(&in[5], &in[11], &in[13], &in[3], &stg2_4, &stg2_5, \
- &stg2_6, &stg2_7, &stp2_10, &stp2_13, &stp2_11, \
- &stp2_12); \
- \
- /* Stage3 */ \
- multiplication_and_add(&in[2], &in[14], &in[10], &in[6], &stg3_0, &stg3_1, \
- &stg3_2, &stg3_3, &stp1_4, &stp1_7, &stp1_5, \
- &stp1_6); \
- \
- stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- \
- stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- \
- /* Stage4 */ \
- multiplication_and_add(&in[0], &in[8], &in[4], &in[12], &stg4_0, &stg4_1, \
- &stg4_2, &stg4_3, &stp2_0, &stp2_1, &stp2_2, \
- &stp2_3); \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4, \
- &stg4_5, &stg4_6, &stg4_7, &stp2_9, &stp2_14, \
- &stp2_10, &stp2_13); \
- \
- /* Stage5 */ \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \
- &stp1_6); \
- \
- stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
- \
- stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
- \
- /* Stage6 */ \
- stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
- \
- multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \
- &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \
- &stp2_11, &stp2_12);
-
#define IDCT16_10 \
/* Stage2 */ \
multiplication_and_add(&in[1], &zero, &zero, &in[3], &stg2_0, &stg2_1, \
&stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \
&stp2_11, &stp2_12);
-void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+static INLINE void idct16_8col(__m128i *const in) {
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i s[16], t[16];
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ // stage 2
+ {
+ const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ multiplication_and_add(&in[1], &in[15], &in[9], &in[7], &k__cospi_p30_m02,
+ &k__cospi_p02_p30, &k__cospi_p14_m18,
+ &k__cospi_p18_p14, &s[8], &s[15], &s[9], &s[14]);
+ }
+ {
+ const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ multiplication_and_add(&in[5], &in[11], &in[13], &in[3], &k__cospi_p22_m10,
+ &k__cospi_p10_p22, &k__cospi_p06_m26,
+ &k__cospi_p26_p06, &s[10], &s[13], &s[11], &s[12]);
+ }
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ // stage 3
+ {
+ const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ multiplication_and_add(&in[2], &in[14], &in[10], &in[6], &k__cospi_p28_m04,
+ &k__cospi_p04_p28, &k__cospi_p12_m20,
+ &k__cospi_p20_p12, &t[4], &t[7], &t[5], &t[6]);
+ }
+ t[8] = _mm_add_epi16(s[8], s[9]);
+ t[9] = _mm_sub_epi16(s[8], s[9]);
+ t[10] = _mm_sub_epi16(s[11], s[10]);
+ t[11] = _mm_add_epi16(s[10], s[11]);
+ t[12] = _mm_add_epi16(s[12], s[13]);
+ t[13] = _mm_sub_epi16(s[12], s[13]);
+ t[14] = _mm_sub_epi16(s[15], s[14]);
+ t[15] = _mm_add_epi16(s[14], s[15]);
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ // stage 4
+ {
+ const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ multiplication_and_add(&in[0], &in[8], &in[4], &in[12], &k__cospi_p16_p16,
+ &k__cospi_p16_m16, &k__cospi_p24_m08,
+ &k__cospi_p08_p24, &s[0], &s[1], &s[2], &s[3]);
+ }
+ s[5] = _mm_sub_epi16(t[4], t[5]);
+ t[4] = _mm_add_epi16(t[4], t[5]);
+ s[6] = _mm_sub_epi16(t[7], t[6]);
+ t[7] = _mm_add_epi16(t[6], t[7]);
+ s[8] = t[8];
+ {
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ multiplication_and_add(&t[9], &t[14], &t[10], &t[13], &k__cospi_m08_p24,
+ &k__cospi_p24_p08, &k__cospi_m24_m08,
+ &k__cospi_m08_p24, &s[9], &s[14], &s[10], &s[13]);
+ }
+ s[11] = t[11];
+ s[12] = t[12];
+ s[15] = t[15];
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ // stage 5
+ t[0] = _mm_add_epi16(s[0], s[3]);
+ t[1] = _mm_add_epi16(s[1], s[2]);
+ t[2] = _mm_sub_epi16(s[1], s[2]);
+ t[3] = _mm_sub_epi16(s[0], s[3]);
+ multiplication_and_add_2(&s[5], &s[6], &k__cospi_m16_p16, &k__cospi_p16_p16,
+ &t[5], &t[6]);
+ t[8] = _mm_add_epi16(s[8], s[11]);
+ t[9] = _mm_add_epi16(s[9], s[10]);
+ t[10] = _mm_sub_epi16(s[9], s[10]);
+ t[11] = _mm_sub_epi16(s[8], s[11]);
+ t[12] = _mm_sub_epi16(s[15], s[12]);
+ t[13] = _mm_sub_epi16(s[14], s[13]);
+ t[14] = _mm_add_epi16(s[13], s[14]);
+ t[15] = _mm_add_epi16(s[12], s[15]);
- __m128i in[16], l[16], r[16], *curr1;
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_8_0, stp1_12_0;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+ // stage 6
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ multiplication_and_add(&t[10], &t[13], &t[11], &t[12], &k__cospi_m16_p16,
+ &k__cospi_p16_p16, &k__cospi_m16_p16,
+ &k__cospi_p16_p16, &s[10], &s[13], &s[11], &s[12]);
+
+ // stage 7
+ in[0] = _mm_add_epi16(s[0], t[15]);
+ in[1] = _mm_add_epi16(s[1], t[14]);
+ in[2] = _mm_add_epi16(s[2], s[13]);
+ in[3] = _mm_add_epi16(s[3], s[12]);
+ in[4] = _mm_add_epi16(s[4], s[11]);
+ in[5] = _mm_add_epi16(s[5], s[10]);
+ in[6] = _mm_add_epi16(s[6], t[9]);
+ in[7] = _mm_add_epi16(s[7], t[8]);
+ in[8] = _mm_sub_epi16(s[7], t[8]);
+ in[9] = _mm_sub_epi16(s[6], t[9]);
+ in[10] = _mm_sub_epi16(s[5], s[10]);
+ in[11] = _mm_sub_epi16(s[4], s[11]);
+ in[12] = _mm_sub_epi16(s[3], s[12]);
+ in[13] = _mm_sub_epi16(s[2], s[13]);
+ in[14] = _mm_sub_epi16(s[1], t[14]);
+ in[15] = _mm_sub_epi16(s[0], t[15]);
+}
+
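+// Gathers one 8x8 quadrant of the 16-wide coefficient block: 8 coefficients
+// from each of 8 consecutive rows, hence the row stride of 8 * 2 = 16
+// elements. Ignoring the 32-to-16-bit packing load_input_data8 may do, a
+// scalar sketch of the access pattern is:
+//   for (r = 0; r < 8; ++r)
+//     memcpy(row[r], input + r * 16, 8 * sizeof(int16_t));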
+static INLINE void idct16_load8x8(const tran_low_t *const input,
+ __m128i *const in) {
+ in[0] = load_input_data8(input);
+ in[1] = load_input_data8(input + 8 * 2);
+ in[2] = load_input_data8(input + 8 * 4);
+ in[3] = load_input_data8(input + 8 * 6);
+ in[4] = load_input_data8(input + 8 * 8);
+ in[5] = load_input_data8(input + 8 * 10);
+ in[6] = load_input_data8(input + 8 * 12);
+ in[7] = load_input_data8(input + 8 * 14);
+}
+
+void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i l[16], r[16], out[16], *in;
int i;
- curr1 = l;
+ in = l;
for (i = 0; i < 2; i++) {
- // 1-D idct
-
- // Load input data.
- in[0] = load_input_data(input);
- in[8] = load_input_data(input + 8 * 1);
- in[1] = load_input_data(input + 8 * 2);
- in[9] = load_input_data(input + 8 * 3);
- in[2] = load_input_data(input + 8 * 4);
- in[10] = load_input_data(input + 8 * 5);
- in[3] = load_input_data(input + 8 * 6);
- in[11] = load_input_data(input + 8 * 7);
- in[4] = load_input_data(input + 8 * 8);
- in[12] = load_input_data(input + 8 * 9);
- in[5] = load_input_data(input + 8 * 10);
- in[13] = load_input_data(input + 8 * 11);
- in[6] = load_input_data(input + 8 * 12);
- in[14] = load_input_data(input + 8 * 13);
- in[7] = load_input_data(input + 8 * 14);
- in[15] = load_input_data(input + 8 * 15);
-
+ idct16_load8x8(input, in);
transpose_16bit_8x8(in, in);
+ idct16_load8x8(input + 8, in + 8);
transpose_16bit_8x8(in + 8, in + 8);
-
- IDCT16
-
- // Stage7
- curr1[0] = _mm_add_epi16(stp2_0, stp1_15);
- curr1[1] = _mm_add_epi16(stp2_1, stp1_14);
- curr1[2] = _mm_add_epi16(stp2_2, stp2_13);
- curr1[3] = _mm_add_epi16(stp2_3, stp2_12);
- curr1[4] = _mm_add_epi16(stp2_4, stp2_11);
- curr1[5] = _mm_add_epi16(stp2_5, stp2_10);
- curr1[6] = _mm_add_epi16(stp2_6, stp1_9);
- curr1[7] = _mm_add_epi16(stp2_7, stp1_8);
- curr1[8] = _mm_sub_epi16(stp2_7, stp1_8);
- curr1[9] = _mm_sub_epi16(stp2_6, stp1_9);
- curr1[10] = _mm_sub_epi16(stp2_5, stp2_10);
- curr1[11] = _mm_sub_epi16(stp2_4, stp2_11);
- curr1[12] = _mm_sub_epi16(stp2_3, stp2_12);
- curr1[13] = _mm_sub_epi16(stp2_2, stp2_13);
- curr1[14] = _mm_sub_epi16(stp2_1, stp1_14);
- curr1[15] = _mm_sub_epi16(stp2_0, stp1_15);
-
- curr1 = r;
+ idct16_8col(in);
+ in = r;
input += 128;
}
+
for (i = 0; i < 2; i++) {
int j;
- // 1-D idct
- transpose_16bit_8x8(l + i * 8, in);
- transpose_16bit_8x8(r + i * 8, in + 8);
-
- IDCT16
-
- // 2-D
- in[0] = _mm_add_epi16(stp2_0, stp1_15);
- in[1] = _mm_add_epi16(stp2_1, stp1_14);
- in[2] = _mm_add_epi16(stp2_2, stp2_13);
- in[3] = _mm_add_epi16(stp2_3, stp2_12);
- in[4] = _mm_add_epi16(stp2_4, stp2_11);
- in[5] = _mm_add_epi16(stp2_5, stp2_10);
- in[6] = _mm_add_epi16(stp2_6, stp1_9);
- in[7] = _mm_add_epi16(stp2_7, stp1_8);
- in[8] = _mm_sub_epi16(stp2_7, stp1_8);
- in[9] = _mm_sub_epi16(stp2_6, stp1_9);
- in[10] = _mm_sub_epi16(stp2_5, stp2_10);
- in[11] = _mm_sub_epi16(stp2_4, stp2_11);
- in[12] = _mm_sub_epi16(stp2_3, stp2_12);
- in[13] = _mm_sub_epi16(stp2_2, stp2_13);
- in[14] = _mm_sub_epi16(stp2_1, stp1_14);
- in[15] = _mm_sub_epi16(stp2_0, stp1_15);
+ transpose_16bit_8x8(l + i * 8, out);
+ transpose_16bit_8x8(r + i * 8, out + 8);
+ idct16_8col(out);
+ // Final rounding and shift
for (j = 0; j < 16; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- recon_and_store(dest + j * stride, in[j]);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ out[j] = _mm_adds_epi16(out[j], final_rounding);
+ out[j] = _mm_srai_epi16(out[j], 6);
+ recon_and_store(dest + j * stride, out[j]);
}
dest += 8;
}
}
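+// Adds the eight 16-bit residual lanes of in_x to all 16 pixels of one row
+// (the low and high 8 pixels receive the same lanes), saturating on the
+// repack; roughly: dest[k] = clip_pixel(dest[k] + res[k & 7]) for k = 0..15.
+// Both callers below splat a DC value, so every lane is equal anyway.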
+static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i d0, d1;
+
+ d0 = _mm_load_si128((__m128i *)(dest));
+ d1 = _mm_unpackhi_epi8(d0, zero);
+ d0 = _mm_unpacklo_epi8(d0, zero);
+ d0 = _mm_add_epi16(in_x, d0);
+ d1 = _mm_add_epi16(in_x, d1);
+ d0 = _mm_packus_epi16(d0, d1);
+ _mm_store_si128((__m128i *)(dest), d0);
+}
+
void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value;
- int a, i;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
+ int i;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- dc_value = _mm_set1_epi16(a);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
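+ // Arithmetic sketch: cospi_16_64 == round(16384 * cos(pi / 4)), so each
+ // dct_const_round_shift above scales by about 1 / sqrt(2); together with
+ // ROUND_POWER_OF_TWO(out, 6) the DC residual a1 is roughly input[0] / 128.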
+ dc_value = _mm_set1_epi16((int16_t)a1);
for (i = 0; i < 16; ++i) {
- recon_and_store(dest + 0, dc_value);
- recon_and_store(dest + 8, dc_value);
+ recon_and_store_16(dest, dc_value);
dest += stride;
}
}
in[15] = _mm_sub_epi16(kZero, s[1]);
}
-static void idct16_8col(__m128i *in) {
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- __m128i u[16], s[16], t[16];
-
- // stage 1
- s[0] = in[0];
- s[1] = in[8];
- s[2] = in[4];
- s[3] = in[12];
- s[4] = in[2];
- s[5] = in[10];
- s[6] = in[6];
- s[7] = in[14];
- s[8] = in[1];
- s[9] = in[9];
- s[10] = in[5];
- s[11] = in[13];
- s[12] = in[3];
- s[13] = in[11];
- s[14] = in[7];
- s[15] = in[15];
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[15]);
- u[1] = _mm_unpackhi_epi16(s[8], s[15]);
- u[2] = _mm_unpacklo_epi16(s[9], s[14]);
- u[3] = _mm_unpackhi_epi16(s[9], s[14]);
- u[4] = _mm_unpacklo_epi16(s[10], s[13]);
- u[5] = _mm_unpackhi_epi16(s[10], s[13]);
- u[6] = _mm_unpacklo_epi16(s[11], s[12]);
- u[7] = _mm_unpackhi_epi16(s[11], s[12]);
-
- s[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p30_m02);
- s[15] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p02_p30);
- s[9] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p14_m18);
- s[14] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p18_p14);
- s[10] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p22_m10);
- s[13] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p10_p22);
- s[11] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p06_m26);
- s[12] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p26_p06);
-
- // stage 3
- t[0] = s[0];
- t[1] = s[1];
- t[2] = s[2];
- t[3] = s[3];
- u[0] = _mm_unpacklo_epi16(s[4], s[7]);
- u[1] = _mm_unpackhi_epi16(s[4], s[7]);
- u[2] = _mm_unpacklo_epi16(s[5], s[6]);
- u[3] = _mm_unpackhi_epi16(s[5], s[6]);
-
- t[4] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p28_m04);
- t[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p04_p28);
- t[5] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p12_m20);
- t[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p20_p12);
- t[8] = _mm_add_epi16(s[8], s[9]);
- t[9] = _mm_sub_epi16(s[8], s[9]);
- t[10] = _mm_sub_epi16(s[11], s[10]);
- t[11] = _mm_add_epi16(s[10], s[11]);
- t[12] = _mm_add_epi16(s[12], s[13]);
- t[13] = _mm_sub_epi16(s[12], s[13]);
- t[14] = _mm_sub_epi16(s[15], s[14]);
- t[15] = _mm_add_epi16(s[14], s[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(t[0], t[1]);
- u[1] = _mm_unpackhi_epi16(t[0], t[1]);
- u[2] = _mm_unpacklo_epi16(t[2], t[3]);
- u[3] = _mm_unpackhi_epi16(t[2], t[3]);
- u[4] = _mm_unpacklo_epi16(t[9], t[14]);
- u[5] = _mm_unpackhi_epi16(t[9], t[14]);
- u[6] = _mm_unpacklo_epi16(t[10], t[13]);
- u[7] = _mm_unpackhi_epi16(t[10], t[13]);
-
- s[0] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
- s[1] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
- s[2] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p24_m08);
- s[3] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p08_p24);
- s[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m08_p24);
- s[14] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p24_p08);
- s[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m24_m08);
- s[13] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m08_p24);
- s[4] = _mm_add_epi16(t[4], t[5]);
- s[5] = _mm_sub_epi16(t[4], t[5]);
- s[6] = _mm_sub_epi16(t[7], t[6]);
- s[7] = _mm_add_epi16(t[6], t[7]);
- s[8] = t[8];
- s[15] = t[15];
- s[11] = t[11];
- s[12] = t[12];
-
- // stage 5
- t[0] = _mm_add_epi16(s[0], s[3]);
- t[1] = _mm_add_epi16(s[1], s[2]);
- t[2] = _mm_sub_epi16(s[1], s[2]);
- t[3] = _mm_sub_epi16(s[0], s[3]);
- t[4] = s[4];
- t[7] = s[7];
-
- multiplication_and_add_2(&s[5], &s[6], &k__cospi_m16_p16, &k__cospi_p16_p16,
- &t[5], &t[6]);
-
- t[8] = _mm_add_epi16(s[8], s[11]);
- t[9] = _mm_add_epi16(s[9], s[10]);
- t[10] = _mm_sub_epi16(s[9], s[10]);
- t[11] = _mm_sub_epi16(s[8], s[11]);
- t[12] = _mm_sub_epi16(s[15], s[12]);
- t[13] = _mm_sub_epi16(s[14], s[13]);
- t[14] = _mm_add_epi16(s[13], s[14]);
- t[15] = _mm_add_epi16(s[12], s[15]);
-
- // stage 6
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_unpacklo_epi16(t[10], t[13]);
- u[1] = _mm_unpackhi_epi16(t[10], t[13]);
- u[2] = _mm_unpacklo_epi16(t[11], t[12]);
- u[3] = _mm_unpackhi_epi16(t[11], t[12]);
-
- s[10] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
- s[13] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
- s[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
- s[12] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
- s[14] = t[14];
- s[15] = t[15];
-
- // stage 7
- in[0] = _mm_add_epi16(s[0], s[15]);
- in[1] = _mm_add_epi16(s[1], s[14]);
- in[2] = _mm_add_epi16(s[2], s[13]);
- in[3] = _mm_add_epi16(s[3], s[12]);
- in[4] = _mm_add_epi16(s[4], s[11]);
- in[5] = _mm_add_epi16(s[5], s[10]);
- in[6] = _mm_add_epi16(s[6], s[9]);
- in[7] = _mm_add_epi16(s[7], s[8]);
- in[8] = _mm_sub_epi16(s[7], s[8]);
- in[9] = _mm_sub_epi16(s[6], s[9]);
- in[10] = _mm_sub_epi16(s[5], s[10]);
- in[11] = _mm_sub_epi16(s[4], s[11]);
- in[12] = _mm_sub_epi16(s[3], s[12]);
- in[13] = _mm_sub_epi16(s[2], s[13]);
- in[14] = _mm_sub_epi16(s[1], s[14]);
- in[15] = _mm_sub_epi16(s[0], s[15]);
-}
-
void idct16_sse2(__m128i *in0, __m128i *in1) {
transpose_16bit_16x16(in0, in1);
idct16_8col(in0);
int i;
// First 1-D inverse DCT
// Load input data.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 2);
- in[2] = load_input_data(input + 8 * 4);
- in[3] = load_input_data(input + 8 * 6);
+ in[0] = load_input_data4(input + 0 * 16);
+ in[1] = load_input_data4(input + 1 * 16);
+ in[2] = load_input_data4(input + 2 * 16);
+ in[3] = load_input_data4(input + 3 * 16);
transpose_16bit_4x4(in, in);
}
}
-#define LOAD_DQCOEFF(reg, input) \
- { \
- reg = load_input_data(input); \
- input += 8; \
- }
-
#define IDCT32_34 \
/* Stage1 */ \
multiplication_and_add_2(&in[1], &zero, &stg1_0, &stg1_1, &stp1_16, \
stp1_30 = stp2_30; \
stp1_31 = stp2_31;
-#define IDCT32 \
- /* Stage1 */ \
- multiplication_and_add(&in[1], &in[31], &in[17], &in[15], &stg1_0, &stg1_1, \
- &stg1_2, &stg1_3, &stp1_16, &stp1_31, &stp1_17, \
- &stp1_30); \
- multiplication_and_add(&in[9], &in[23], &in[25], &in[7], &stg1_4, &stg1_5, \
- &stg1_6, &stg1_7, &stp1_18, &stp1_29, &stp1_19, \
- &stp1_28); \
- multiplication_and_add(&in[5], &in[27], &in[21], &in[11], &stg1_8, &stg1_9, \
- &stg1_10, &stg1_11, &stp1_20, &stp1_27, &stp1_21, \
- &stp1_26); \
- multiplication_and_add(&in[13], &in[19], &in[29], &in[3], &stg1_12, \
- &stg1_13, &stg1_14, &stg1_15, &stp1_22, &stp1_25, \
- &stp1_23, &stp1_24); \
- \
- /* Stage2 */ \
- multiplication_and_add(&in[2], &in[30], &in[18], &in[14], &stg2_0, &stg2_1, \
- &stg2_2, &stg2_3, &stp2_8, &stp2_15, &stp2_9, \
- &stp2_14); \
- multiplication_and_add(&in[10], &in[22], &in[26], &in[6], &stg2_4, &stg2_5, \
- &stg2_6, &stg2_7, &stp2_10, &stp2_13, &stp2_11, \
- &stp2_12); \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
- stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
- stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
- \
- stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
- stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
- stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
- stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
- \
- stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
- stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
- stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
- \
- /* Stage3 */ \
- multiplication_and_add(&in[4], &in[28], &in[20], &in[12], &stg3_0, &stg3_1, \
- &stg3_2, &stg3_3, &stp1_4, &stp1_7, &stp1_5, \
- &stp1_6); \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
- stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
- stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
- stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
- stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
- \
- multiplication_and_add(&stp2_17, &stp2_30, &stp2_18, &stp2_29, &stg3_4, \
- &stg3_5, &stg3_6, &stg3_4, &stp1_17, &stp1_30, \
- &stp1_18, &stp1_29); \
- multiplication_and_add(&stp2_21, &stp2_26, &stp2_22, &stp2_25, &stg3_8, \
- &stg3_9, &stg3_10, &stg3_8, &stp1_21, &stp1_26, \
- &stp1_22, &stp1_25); \
- \
- stp1_16 = stp2_16; \
- stp1_31 = stp2_31; \
- stp1_19 = stp2_19; \
- stp1_20 = stp2_20; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_27 = stp2_27; \
- stp1_28 = stp2_28; \
- \
- /* Stage4 */ \
- multiplication_and_add(&in[0], &in[16], &in[8], &in[24], &stg4_0, &stg4_1, \
- &stg4_2, &stg4_3, &stp2_0, &stp2_1, &stp2_2, \
- &stp2_3); \
- \
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
- \
- multiplication_and_add(&stp1_9, &stp1_14, &stp1_10, &stp1_13, &stg4_4, \
- &stg4_5, &stg4_6, &stg4_4, &stp2_9, &stp2_14, \
- &stp2_10, &stp2_13); \
- \
- stp2_8 = stp1_8; \
- stp2_15 = stp1_15; \
- stp2_11 = stp1_11; \
- stp2_12 = stp1_12; \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
- stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
- stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
- stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
- stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
- stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
- \
- stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
- stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
- stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
- stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
- stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
- \
- /* Stage5 */ \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
- multiplication_and_add_2(&stp2_6, &stp2_5, &stg4_1, &stg4_0, &stp1_5, \
- &stp1_6); \
- \
- stp1_4 = stp2_4; \
- stp1_7 = stp2_7; \
- \
- stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
- stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
- stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
- stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
- stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- \
- multiplication_and_add(&stp2_18, &stp2_29, &stp2_19, &stp2_28, &stg4_4, \
- &stg4_5, &stg4_4, &stg4_5, &stp1_18, &stp1_29, \
- &stp1_19, &stp1_28); \
- multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg4_6, \
- &stg4_4, &stg4_6, &stg4_4, &stp1_20, &stp1_27, \
- &stp1_21, &stp1_26); \
- \
- stp1_22 = stp2_22; \
- stp1_23 = stp2_23; \
- stp1_24 = stp2_24; \
- stp1_25 = stp2_25; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31; \
- \
- /* Stage6 */ \
- stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
- stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
- stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
- stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
- stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
- stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
- stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
- stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
- \
- stp2_8 = stp1_8; \
- stp2_9 = stp1_9; \
- stp2_14 = stp1_14; \
- stp2_15 = stp1_15; \
- \
- multiplication_and_add(&stp1_10, &stp1_13, &stp1_11, &stp1_12, &stg6_0, \
- &stg4_0, &stg6_0, &stg4_0, &stp2_10, &stp2_13, \
- &stp2_11, &stp2_12); \
- \
- stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
- stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
- stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
- stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
- stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
- stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
- stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
- stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
- \
- stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
- stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
- stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
- stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
- stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
- stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
- stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
- stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
- \
- /* Stage7 */ \
- stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
- stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
- stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
- stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
- stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
- stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
- stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
- stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
- stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
- stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
- stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
- stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
- stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
- stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
- stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
- stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
- \
- stp1_16 = stp2_16; \
- stp1_17 = stp2_17; \
- stp1_18 = stp2_18; \
- stp1_19 = stp2_19; \
- \
- multiplication_and_add(&stp2_20, &stp2_27, &stp2_21, &stp2_26, &stg6_0, \
- &stg4_0, &stg6_0, &stg4_0, &stp1_20, &stp1_27, \
- &stp1_21, &stp1_26); \
- multiplication_and_add(&stp2_22, &stp2_25, &stp2_23, &stp2_24, &stg6_0, \
- &stg4_0, &stg6_0, &stg4_0, &stp1_22, &stp1_25, \
- &stp1_23, &stp1_24); \
- \
- stp1_28 = stp2_28; \
- stp1_29 = stp2_29; \
- stp1_30 = stp2_30; \
- stp1_31 = stp2_31;
-
// Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
int i;
// Load input data. Only need to load the top left 8x8 block.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 32);
- in[2] = load_input_data(input + 64);
- in[3] = load_input_data(input + 96);
- in[4] = load_input_data(input + 128);
- in[5] = load_input_data(input + 160);
- in[6] = load_input_data(input + 192);
- in[7] = load_input_data(input + 224);
+ in[0] = load_input_data8(input + 0 * 32);
+ in[1] = load_input_data8(input + 1 * 32);
+ in[2] = load_input_data8(input + 2 * 32);
+ in[3] = load_input_data8(input + 3 * 32);
+ in[4] = load_input_data8(input + 4 * 32);
+ in[5] = load_input_data8(input + 5 * 32);
+ in[6] = load_input_data8(input + 6 * 32);
+ in[7] = load_input_data8(input + 7 * 32);
transpose_16bit_8x8(in, in);
IDCT32_34
}
}
-void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
- int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- const __m128i zero = _mm_setzero_si128();
+// For each 8x32 block __m128i in[32]:
+// inputs are the indices 0, 4, 8, 12, 16, 20, 24, 28;
+// outputs are pixels 0-7, written to __m128i out[8].
+static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[8]*/) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ in the replaced IDCT32 macro
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ in the replaced IDCT32 macro
- // idct constants for each stage
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ {
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
+ butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
+ }
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ v4 = _mm_add_epi16(u4, u5);
+ v5 = _mm_sub_epi16(u4, u5);
+ v6 = _mm_sub_epi16(u7, u6);
+ v7 = _mm_add_epi16(u7, u6);
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
+
+ butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
+ butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
+ }
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ v0 = _mm_add_epi16(u0, u3);
+ v1 = _mm_add_epi16(u1, u2);
+ v2 = _mm_sub_epi16(u1, u2);
+ v3 = _mm_sub_epi16(u0, u3);
+
+ out[0] = _mm_add_epi16(v0, v7);
+ out[1] = _mm_add_epi16(v1, v6);
+ out[2] = _mm_add_epi16(v2, v5);
+ out[3] = _mm_add_epi16(v3, v4);
+ out[4] = _mm_sub_epi16(v3, v4);
+ out[5] = _mm_sub_epi16(v2, v5);
+ out[6] = _mm_sub_epi16(v1, v6);
+ out[7] = _mm_sub_epi16(v0, v7);
+}
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+// For each 8x32 block __m128i in[32]:
+// inputs are the indices 2, 6, 10, 14, 18, 22, 26, 30;
+// outputs are pixels 8-15, written to out[0..7] (the caller passes temp + 8).
+static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ in the replaced macro
+ __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ in the replaced macro
- __m128i in[32], col[128], zero_idx[16];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
- stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
- stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
- stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
- stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
- stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
- stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- int i, j, i32;
+ {
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
+ butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
+ }
- for (i = 0; i < 4; i++) {
- i32 = (i << 5);
- // First 1-D idct
- // Load input data.
- LOAD_DQCOEFF(in[0], input);
- LOAD_DQCOEFF(in[8], input);
- LOAD_DQCOEFF(in[16], input);
- LOAD_DQCOEFF(in[24], input);
- LOAD_DQCOEFF(in[1], input);
- LOAD_DQCOEFF(in[9], input);
- LOAD_DQCOEFF(in[17], input);
- LOAD_DQCOEFF(in[25], input);
- LOAD_DQCOEFF(in[2], input);
- LOAD_DQCOEFF(in[10], input);
- LOAD_DQCOEFF(in[18], input);
- LOAD_DQCOEFF(in[26], input);
- LOAD_DQCOEFF(in[3], input);
- LOAD_DQCOEFF(in[11], input);
- LOAD_DQCOEFF(in[19], input);
- LOAD_DQCOEFF(in[27], input);
-
- LOAD_DQCOEFF(in[4], input);
- LOAD_DQCOEFF(in[12], input);
- LOAD_DQCOEFF(in[20], input);
- LOAD_DQCOEFF(in[28], input);
- LOAD_DQCOEFF(in[5], input);
- LOAD_DQCOEFF(in[13], input);
- LOAD_DQCOEFF(in[21], input);
- LOAD_DQCOEFF(in[29], input);
- LOAD_DQCOEFF(in[6], input);
- LOAD_DQCOEFF(in[14], input);
- LOAD_DQCOEFF(in[22], input);
- LOAD_DQCOEFF(in[30], input);
- LOAD_DQCOEFF(in[7], input);
- LOAD_DQCOEFF(in[15], input);
- LOAD_DQCOEFF(in[23], input);
- LOAD_DQCOEFF(in[31], input);
-
- // checking if all entries are zero
- zero_idx[0] = _mm_or_si128(in[0], in[1]);
- zero_idx[1] = _mm_or_si128(in[2], in[3]);
- zero_idx[2] = _mm_or_si128(in[4], in[5]);
- zero_idx[3] = _mm_or_si128(in[6], in[7]);
- zero_idx[4] = _mm_or_si128(in[8], in[9]);
- zero_idx[5] = _mm_or_si128(in[10], in[11]);
- zero_idx[6] = _mm_or_si128(in[12], in[13]);
- zero_idx[7] = _mm_or_si128(in[14], in[15]);
- zero_idx[8] = _mm_or_si128(in[16], in[17]);
- zero_idx[9] = _mm_or_si128(in[18], in[19]);
- zero_idx[10] = _mm_or_si128(in[20], in[21]);
- zero_idx[11] = _mm_or_si128(in[22], in[23]);
- zero_idx[12] = _mm_or_si128(in[24], in[25]);
- zero_idx[13] = _mm_or_si128(in[26], in[27]);
- zero_idx[14] = _mm_or_si128(in[28], in[29]);
- zero_idx[15] = _mm_or_si128(in[30], in[31]);
-
- zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
- zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
-
- zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
- zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
- zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
- zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
- zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
- zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
- zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
-
- if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) {
- col[i32 + 0] = _mm_setzero_si128();
- col[i32 + 1] = _mm_setzero_si128();
- col[i32 + 2] = _mm_setzero_si128();
- col[i32 + 3] = _mm_setzero_si128();
- col[i32 + 4] = _mm_setzero_si128();
- col[i32 + 5] = _mm_setzero_si128();
- col[i32 + 6] = _mm_setzero_si128();
- col[i32 + 7] = _mm_setzero_si128();
- col[i32 + 8] = _mm_setzero_si128();
- col[i32 + 9] = _mm_setzero_si128();
- col[i32 + 10] = _mm_setzero_si128();
- col[i32 + 11] = _mm_setzero_si128();
- col[i32 + 12] = _mm_setzero_si128();
- col[i32 + 13] = _mm_setzero_si128();
- col[i32 + 14] = _mm_setzero_si128();
- col[i32 + 15] = _mm_setzero_si128();
- col[i32 + 16] = _mm_setzero_si128();
- col[i32 + 17] = _mm_setzero_si128();
- col[i32 + 18] = _mm_setzero_si128();
- col[i32 + 19] = _mm_setzero_si128();
- col[i32 + 20] = _mm_setzero_si128();
- col[i32 + 21] = _mm_setzero_si128();
- col[i32 + 22] = _mm_setzero_si128();
- col[i32 + 23] = _mm_setzero_si128();
- col[i32 + 24] = _mm_setzero_si128();
- col[i32 + 25] = _mm_setzero_si128();
- col[i32 + 26] = _mm_setzero_si128();
- col[i32 + 27] = _mm_setzero_si128();
- col[i32 + 28] = _mm_setzero_si128();
- col[i32 + 29] = _mm_setzero_si128();
- col[i32 + 30] = _mm_setzero_si128();
- col[i32 + 31] = _mm_setzero_si128();
- continue;
- }
+ v8 = _mm_add_epi16(u8, u9);
+ v9 = _mm_sub_epi16(u8, u9);
+ v14 = _mm_sub_epi16(u15, u14);
+ v15 = _mm_add_epi16(u15, u14);
+
+ {
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+ butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
+ butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
+ }
+
+ v10 = _mm_sub_epi16(u11, u10);
+ v11 = _mm_add_epi16(u11, u10);
+ v12 = _mm_add_epi16(u12, u13);
+ v13 = _mm_sub_epi16(u12, u13);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
+ butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(v8, v11);
+ out[1] = _mm_add_epi16(v9, v10);
+ out[6] = _mm_add_epi16(v14, v13);
+ out[7] = _mm_add_epi16(v15, v12);
+
+ out[2] = _mm_sub_epi16(v9, v10);
+ out[3] = _mm_sub_epi16(v8, v11);
+ out[4] = _mm_sub_epi16(v15, v12);
+ out[5] = _mm_sub_epi16(v14, v13);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
+ butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
+ }
+}
+
+// For each 8x32 block __m128i in[32]:
+// inputs are the odd indices
+// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31;
+// outputs are pixels 16-23 and 24-31. To avoid hiding an offset of 16 inside
+// this function, the results are written to out[0..15]; the caller passes
+// temp + 16.
+static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[16]*/) {
+ __m128i v16, v17, v18, v19, v20, v21, v22, v23;
+ __m128i v24, v25, v26, v27, v28, v29, v30, v31;
+ __m128i u16, u17, u18, u19, u20, u21, u22, u23;
+ __m128i u24, u25, u26, u27, u28, u29, u30, u31;
+
+ {
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+ butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
+ butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
+ butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
+ butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
+
+ butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
+ butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
+
+ butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
+ butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
+ }
+
+ v16 = _mm_add_epi16(u16, u17);
+ v17 = _mm_sub_epi16(u16, u17);
+ v18 = _mm_sub_epi16(u19, u18);
+ v19 = _mm_add_epi16(u19, u18);
+
+ v20 = _mm_add_epi16(u20, u21);
+ v21 = _mm_sub_epi16(u20, u21);
+ v22 = _mm_sub_epi16(u23, u22);
+ v23 = _mm_add_epi16(u23, u22);
+
+ v24 = _mm_add_epi16(u24, u25);
+ v25 = _mm_sub_epi16(u24, u25);
+ v26 = _mm_sub_epi16(u27, u26);
+ v27 = _mm_add_epi16(u27, u26);
+
+ v28 = _mm_add_epi16(u28, u29);
+ v29 = _mm_sub_epi16(u28, u29);
+ v30 = _mm_sub_epi16(u31, u30);
+ v31 = _mm_add_epi16(u31, u30);
+
+ {
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+ butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
+ butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
+ butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
+ butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
+ }
+
+ u16 = _mm_add_epi16(v16, v19);
+ u17 = _mm_add_epi16(v17, v18);
+ u18 = _mm_sub_epi16(v17, v18);
+ u19 = _mm_sub_epi16(v16, v19);
+ u20 = _mm_sub_epi16(v23, v20);
+ u21 = _mm_sub_epi16(v22, v21);
+ u22 = _mm_add_epi16(v22, v21);
+ u23 = _mm_add_epi16(v23, v20);
+
+ u24 = _mm_add_epi16(v24, v27);
+ u25 = _mm_add_epi16(v25, v26);
+ u26 = _mm_sub_epi16(v25, v26);
+ u27 = _mm_sub_epi16(v24, v27);
+
+ u28 = _mm_sub_epi16(v31, v28);
+ u29 = _mm_sub_epi16(v30, v29);
+ u30 = _mm_add_epi16(v29, v30);
+ u31 = _mm_add_epi16(v28, v31);
+
+ {
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
+ butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
+ butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
+ butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
+ }
+
+ out[0] = _mm_add_epi16(u16, u23);
+ out[1] = _mm_add_epi16(u17, u22);
+ out[2] = _mm_add_epi16(u18, u21);
+ out[3] = _mm_add_epi16(u19, u20);
+ out[4] = _mm_sub_epi16(u19, u20);
+ out[5] = _mm_sub_epi16(u18, u21);
+ out[6] = _mm_sub_epi16(u17, u22);
+ out[7] = _mm_sub_epi16(u16, u23);
+
+ out[8] = _mm_sub_epi16(u31, u24);
+ out[9] = _mm_sub_epi16(u30, u25);
+ out[10] = _mm_sub_epi16(u29, u26);
+ out[11] = _mm_sub_epi16(u28, u27);
+ out[12] = _mm_add_epi16(u27, u28);
+ out[13] = _mm_add_epi16(u26, u29);
+ out[14] = _mm_add_epi16(u25, u30);
+ out[15] = _mm_add_epi16(u24, u31);
+
+ {
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
+ butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
+ butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
+ butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
+ }
+}
+
+static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i temp[16];
+ idct32_full_8x32_quarter_1(in, temp);
+ idct32_full_8x32_quarter_2(in, &temp[8]);
+ add_sub_butterfly(temp, out, 16);
+}
+
+static void idct32_full_8x32(const __m128i *in /*in[32]*/,
+ __m128i *out /*out[32]*/) {
+ __m128i temp[32];
+ idct32_full_8x32_quarter_1_2(in, temp);
+ idct32_full_8x32_quarter_3_4(in, &temp[16]);
+ add_sub_butterfly(temp, out, 32);
+}
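+// Sketch of the decomposition: quarter_1 consumes inputs at indices 0 mod 4,
+// quarter_2 those at 2 mod 4, and quarter_3_4 the odd indices. Each
+// add_sub_butterfly then realizes the last butterfly stage of the recursive
+// IDCT: out[i] = temp[i] + temp[size - 1 - i], out[size - 1 - i] =
+// temp[i] - temp[size - 1 - i].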
+
+static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ in[i] = load_input_data8(input);
+ in[i + 8] = load_input_data8(input + 8);
+ in[i + 16] = load_input_data8(input + 16);
+ in[i + 24] = load_input_data8(input + 24);
+ input += 32;
+ }
+}
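+// Row i of the 32-wide block lands in in[i], in[i + 8], in[i + 16] and
+// in[i + 24] (columns 0-7, 8-15, 16-23 and 24-31 respectively), matching the
+// four 8x8 transposes performed by the caller.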
+
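+// Full 32x32 inverse: four 8x32 row transforms into col[], then four 8x32
+// column transforms plus reconstruction. Unlike the version it replaces, this
+// computes unconditionally; the all-zero 8x32 early-out (the zero_idx scan)
+// is gone.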
+void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col[128], in[32];
+ int i, j;
+
+ // rows
+ for (i = 0; i < 4; ++i) {
+ load_buffer_8x32(input, in);
+ input += 32 << 3;
// Transpose 32x8 block to 8x32 block
    transpose_16bit_8x8(in, in);
    transpose_16bit_8x8(in + 8, in + 8);
    transpose_16bit_8x8(in + 16, in + 16);
transpose_16bit_8x8(in + 24, in + 24);
- IDCT32
-
- // 1_D: Store 32 intermediate results for each 8x32 block.
- col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
- col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
- col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
- col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
- col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
- col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
- col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
- col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
- col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
- col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
- col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
- col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
- col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
- col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
- col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
- col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
- col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
- col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
- col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
- col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
- col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
- col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
- col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
- col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
- col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
- col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
- col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
- col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
- col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
- col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
- col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
- col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ idct32_full_8x32(in, col + (i << 5));
}
- for (i = 0; i < 4; i++) {
- // Second 1-D idct
- j = i << 3;
+ // columns
+ for (i = 0; i < 4; ++i) {
+ j = i << 3;
// Transpose 32x8 block to 8x32 block
transpose_16bit_8x8(col + j, in);
transpose_16bit_8x8(col + j + 32, in + 8);
transpose_16bit_8x8(col + j + 64, in + 16);
transpose_16bit_8x8(col + j + 96, in + 24);
- IDCT32
-
- // 2_D: Calculate the results and store them to destination.
- in[0] = _mm_add_epi16(stp1_0, stp1_31);
- in[1] = _mm_add_epi16(stp1_1, stp1_30);
- in[2] = _mm_add_epi16(stp1_2, stp1_29);
- in[3] = _mm_add_epi16(stp1_3, stp1_28);
- in[4] = _mm_add_epi16(stp1_4, stp1_27);
- in[5] = _mm_add_epi16(stp1_5, stp1_26);
- in[6] = _mm_add_epi16(stp1_6, stp1_25);
- in[7] = _mm_add_epi16(stp1_7, stp1_24);
- in[8] = _mm_add_epi16(stp1_8, stp1_23);
- in[9] = _mm_add_epi16(stp1_9, stp1_22);
- in[10] = _mm_add_epi16(stp1_10, stp1_21);
- in[11] = _mm_add_epi16(stp1_11, stp1_20);
- in[12] = _mm_add_epi16(stp1_12, stp1_19);
- in[13] = _mm_add_epi16(stp1_13, stp1_18);
- in[14] = _mm_add_epi16(stp1_14, stp1_17);
- in[15] = _mm_add_epi16(stp1_15, stp1_16);
- in[16] = _mm_sub_epi16(stp1_15, stp1_16);
- in[17] = _mm_sub_epi16(stp1_14, stp1_17);
- in[18] = _mm_sub_epi16(stp1_13, stp1_18);
- in[19] = _mm_sub_epi16(stp1_12, stp1_19);
- in[20] = _mm_sub_epi16(stp1_11, stp1_20);
- in[21] = _mm_sub_epi16(stp1_10, stp1_21);
- in[22] = _mm_sub_epi16(stp1_9, stp1_22);
- in[23] = _mm_sub_epi16(stp1_8, stp1_23);
- in[24] = _mm_sub_epi16(stp1_7, stp1_24);
- in[25] = _mm_sub_epi16(stp1_6, stp1_25);
- in[26] = _mm_sub_epi16(stp1_5, stp1_26);
- in[27] = _mm_sub_epi16(stp1_4, stp1_27);
- in[28] = _mm_sub_epi16(stp1_3, stp1_28);
- in[29] = _mm_sub_epi16(stp1_2, stp1_29);
- in[30] = _mm_sub_epi16(stp1_1, stp1_30);
- in[31] = _mm_sub_epi16(stp1_0, stp1_31);
-
- for (j = 0; j < 32; ++j) {
- // Final rounding and shift
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j] = _mm_srai_epi16(in[j], 6);
- recon_and_store(dest + j * stride, in[j]);
- }
-
+ idct32_full_8x32(in, in);
+ store_buffer_8x32(in, dest, stride);
dest += 8;
}
}
void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
__m128i dc_value;
- int a, j;
-
- a = (int)dct_const_round_shift(input[0] * cospi_16_64);
- a = (int)dct_const_round_shift(a * cospi_16_64);
- a = ROUND_POWER_OF_TWO(a, 6);
+ int j;
+ tran_high_t a1;
+ tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- dc_value = _mm_set1_epi16(a);
+ out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+ a1 = ROUND_POWER_OF_TWO(out, 6);
+ dc_value = _mm_set1_epi16((int16_t)a1);
for (j = 0; j < 32; ++j) {
- recon_and_store(dest + 0 + j * stride, dc_value);
- recon_and_store(dest + 8 + j * stride, dc_value);
- recon_and_store(dest + 16 + j * stride, dc_value);
- recon_and_store(dest + 24 + j * stride, dc_value);
+ recon_and_store_16(dest + j * stride + 0, dc_value);
+ recon_and_store_16(dest + j * stride + 16, dc_value);
}
}
return _mm_packs_epi32(t0, t1);
}
-// Function to allow 8 bit optimisations to be used when profile 0 is used with
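+// With cst0 = pair_set_epi16(c0, c1), each 16-bit lane k effectively yields
+// res0[k] = WRAPLOW(dct_const_round_shift(in0[k] * c0 + in1[k] * c1)), and
+// likewise res1 from cst1: one butterfly/rotation over eight lanes.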
+static INLINE void multiplication_and_add_2(const __m128i *const in0,
+ const __m128i *const in1,
+ const __m128i *const cst0,
+ const __m128i *const cst1,
+ __m128i *const res0,
+ __m128i *const res1) {
+ const __m128i lo = _mm_unpacklo_epi16(*in0, *in1);
+ const __m128i hi = _mm_unpackhi_epi16(*in0, *in1);
+ *res0 = idct_calc_wraplow_sse2(lo, hi, *cst0);
+ *res1 = idct_calc_wraplow_sse2(lo, hi, *cst1);
+}
+
+// Functions to allow 8 bit optimisations to be used when profile 0 is used with
// highbitdepth enabled
-static INLINE __m128i load_input_data(const tran_low_t *data) {
+static INLINE __m128i load_input_data4(const tran_low_t *data) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i in = _mm_load_si128((const __m128i *)data);
+ return _mm_packs_epi32(in, zero);
+#else
+ return _mm_loadl_epi64((const __m128i *)data);
+#endif
+}
+
+static INLINE __m128i load_input_data8(const tran_low_t *data) {
#if CONFIG_VP9_HIGHBITDEPTH
- // in0: 0 X 1 X 2 X 3 X
- // in1: 4 X 5 X 6 X 7 X
- // t0: 0 4 X X 1 5 X X
- // t1: 2 6 X X 3 7 X X
- // t2: 0 2 4 6 X X X X
- // t3: 1 3 5 7 X X X X
- // rtn: 0 1 2 3 4 5 6 7
const __m128i in0 = _mm_load_si128((const __m128i *)data);
const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4));
- const __m128i t0 = _mm_unpacklo_epi16(in0, in1);
- const __m128i t1 = _mm_unpackhi_epi16(in0, in1);
- const __m128i t2 = _mm_unpacklo_epi16(t0, t1);
- const __m128i t3 = _mm_unpackhi_epi16(t0, t1);
- return _mm_unpacklo_epi16(t2, t3);
+ return _mm_packs_epi32(in0, in1);
#else
return _mm_load_si128((const __m128i *)data);
#endif
static INLINE void load_buffer_8x8(const tran_low_t *const input,
__m128i *const in) {
- in[0] = load_input_data(input + 0 * 8);
- in[1] = load_input_data(input + 1 * 8);
- in[2] = load_input_data(input + 2 * 8);
- in[3] = load_input_data(input + 3 * 8);
- in[4] = load_input_data(input + 4 * 8);
- in[5] = load_input_data(input + 5 * 8);
- in[6] = load_input_data(input + 6 * 8);
- in[7] = load_input_data(input + 7 * 8);
+ in[0] = load_input_data8(input + 0 * 8);
+ in[1] = load_input_data8(input + 1 * 8);
+ in[2] = load_input_data8(input + 2 * 8);
+ in[3] = load_input_data8(input + 3 * 8);
+ in[4] = load_input_data8(input + 4 * 8);
+ in[5] = load_input_data8(input + 5 * 8);
+ in[6] = load_input_data8(input + 6 * 8);
+ in[7] = load_input_data8(input + 7 * 8);
}
static INLINE void load_buffer_8x16(const tran_low_t *const input,
__m128i *const in) {
- in[0] = load_input_data(input + 0 * 16);
- in[1] = load_input_data(input + 1 * 16);
- in[2] = load_input_data(input + 2 * 16);
- in[3] = load_input_data(input + 3 * 16);
- in[4] = load_input_data(input + 4 * 16);
- in[5] = load_input_data(input + 5 * 16);
- in[6] = load_input_data(input + 6 * 16);
- in[7] = load_input_data(input + 7 * 16);
-
- in[8] = load_input_data(input + 8 * 16);
- in[9] = load_input_data(input + 9 * 16);
- in[10] = load_input_data(input + 10 * 16);
- in[11] = load_input_data(input + 11 * 16);
- in[12] = load_input_data(input + 12 * 16);
- in[13] = load_input_data(input + 13 * 16);
- in[14] = load_input_data(input + 14 * 16);
- in[15] = load_input_data(input + 15 * 16);
+ in[0] = load_input_data8(input + 0 * 16);
+ in[1] = load_input_data8(input + 1 * 16);
+ in[2] = load_input_data8(input + 2 * 16);
+ in[3] = load_input_data8(input + 3 * 16);
+ in[4] = load_input_data8(input + 4 * 16);
+ in[5] = load_input_data8(input + 5 * 16);
+ in[6] = load_input_data8(input + 6 * 16);
+ in[7] = load_input_data8(input + 7 * 16);
+
+ in[8] = load_input_data8(input + 8 * 16);
+ in[9] = load_input_data8(input + 9 * 16);
+ in[10] = load_input_data8(input + 10 * 16);
+ in[11] = load_input_data8(input + 11 * 16);
+ in[12] = load_input_data8(input + 12 * 16);
+ in[13] = load_input_data8(input + 13 * 16);
+ in[14] = load_input_data8(input + 14 * 16);
+ in[15] = load_input_data8(input + 15 * 16);
}
static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) {
*(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}
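+// Rounds, shifts and reconstructs all 32 rows of an 8-pixel-wide strip:
+// adding (1 << 5) then shifting right by 6 is ROUND_POWER_OF_TWO(x, 6) per
+// lane, with the saturating add guarding the rounding bias against overflow.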
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm_srai_epi16(in[j], 6);
+ in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+ recon_and_store(dst, in[j]);
+ dst += stride;
+ recon_and_store(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+
+// Addition/subtraction butterflies only (no rotations); size must be 16 or 32.
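+// e.g. size == 16: out[0] = in[0] + in[15], out[15] = in[0] - in[15], ...,
+// out[7] = in[7] + in[8], out[8] = in[7] - in[8].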
+static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
+ int size) {
+ int i = 0;
+ const int num = size >> 1;
+ const int bound = size - 1;
+ while (i < num) {
+ out[i] = _mm_add_epi16(in[i], in[bound - i]);
+ out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
+ i++;
+ }
+}
+
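+// Expects __m128i tmp0 .. tmp3 and a DCT_CONST_ROUNDING vector named
+// `rounding` to be declared in the enclosing scope; leaves the four rounded,
+// shifted madd results in tmp0 .. tmp3 for the caller to pack.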
+#define BUTTERFLY_PAIR(x0, x1, co0, co1) \
+ do { \
+ tmp0 = _mm_madd_epi16(x0, co0); \
+ tmp1 = _mm_madd_epi16(x1, co0); \
+ tmp2 = _mm_madd_epi16(x0, co1); \
+ tmp3 = _mm_madd_epi16(x1, co1); \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ } while (0)
+
+static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
+ const __m128i *c0, const __m128i *c1, __m128i *y0,
+ __m128i *y1) {
+ __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ u0 = _mm_unpacklo_epi16(*x0, *x1);
+ u1 = _mm_unpackhi_epi16(*x0, *x1);
+ BUTTERFLY_PAIR(u0, u1, *c0, *c1);
+ *y0 = _mm_packs_epi32(tmp0, tmp1);
+ *y1 = _mm_packs_epi32(tmp2, tmp3);
+}
+
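+// In-place variant of butterfly(): x0 and x1 are overwritten with the rotated
+// outputs.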
+static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
+ const __m128i *c1) {
+ __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+ u0 = _mm_unpacklo_epi16(*x0, *x1);
+ u1 = _mm_unpackhi_epi16(*x0, *x1);
+ BUTTERFLY_PAIR(u0, u1, *c0, *c1);
+ *x0 = _mm_packs_epi32(tmp0, tmp1);
+ *x1 = _mm_packs_epi32(tmp2, tmp3);
+}
+
void idct4_sse2(__m128i *in);
void idct8_sse2(__m128i *in);
void idct16_sse2(__m128i *in0, __m128i *in1);
void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
- const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
- const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
- const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
- const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
- const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
- const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
- const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
-
- __m128i in[8];
- __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
- __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp[4];
-
- // Rows. Load 4-row input data.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 8 * 1);
- in[2] = load_input_data(input + 8 * 2);
- in[3] = load_input_data(input + 8 * 3);
-
- // 4x4 Transpose
- transpose_16bit_4x4(in, in);
-
- // Stage1
- tmp[0] = _mm_mulhrs_epi16(in[0], stg1_0);
- tmp[1] = _mm_mulhrs_epi16(in[0], stg1_1);
- tmp[2] = _mm_mulhrs_epi16(in[1], stg1_2);
- tmp[3] = _mm_mulhrs_epi16(in[1], stg1_3);
-
- stp1_4 = _mm_unpackhi_epi64(tmp[0], tmp[1]);
- stp1_5 = _mm_unpackhi_epi64(tmp[2], tmp[3]);
-
- // Stage2
- tmp[0] = _mm_mulhrs_epi16(in[0], stg2_0);
- stp2_0 = _mm_unpacklo_epi64(tmp[0], tmp[0]);
-
- tmp[1] = _mm_mulhrs_epi16(in[1], stg2_2);
- tmp[2] = _mm_mulhrs_epi16(in[1], stg2_3);
- stp2_2 = _mm_unpacklo_epi64(tmp[2], tmp[1]);
-
- tmp[0] = _mm_add_epi16(stp1_4, stp1_5);
- tmp[1] = _mm_sub_epi16(stp1_4, stp1_5);
-
- stp2_4 = tmp[0];
- stp2_5 = _mm_unpacklo_epi64(tmp[1], zero);
- stp2_6 = _mm_unpackhi_epi64(tmp[1], zero);
+ const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64);
+ const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64);
+ const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64);
+ const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i cospi_16_64d = _mm_set1_epi16(2 * cospi_16_64);
+ const __m128i cospi_28_64d = _mm_set1_epi16(2 * cospi_28_64);
+ const __m128i cospi_4_64d = _mm_set1_epi16(2 * cospi_4_64);
+ const __m128i cospi_n20_64d = _mm_set1_epi16(-2 * cospi_20_64);
+ const __m128i cospi_12_64d = _mm_set1_epi16(2 * cospi_12_64);
+ const __m128i cospi_24_64d = _mm_set1_epi16(2 * cospi_24_64);
+ const __m128i cospi_8_64d = _mm_set1_epi16(2 * cospi_8_64);
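+ // The doubled constants exploit _mm_mulhrs_epi16, which computes
+ // (a * b + (1 << 14)) >> 15; with b == 2 * cospi this matches
+ // dct_const_round_shift(a * cospi) exactly, barring 16-bit overflow.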
+ __m128i in[8], step1[8], step2[8], tmp[4];
+
+ in[0] = load_input_data4(input + 0 * 8);
+ in[1] = load_input_data4(input + 1 * 8);
+ in[2] = load_input_data4(input + 2 * 8);
+ in[3] = load_input_data4(input + 3 * 8);
+
+ // pass 1
- tmp[0] = _mm_unpacklo_epi16(stp2_5, stp2_6);
- tmp[1] = _mm_madd_epi16(tmp[0], stg3_0);
- tmp[2] = _mm_madd_epi16(tmp[0], stk2_0); // stg3_1 = stk2_0
-
- tmp[1] = _mm_add_epi32(tmp[1], rounding);
- tmp[2] = _mm_add_epi32(tmp[2], rounding);
- tmp[1] = _mm_srai_epi32(tmp[1], DCT_CONST_BITS);
- tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp[1], tmp[2]);
-
- // Stage3
- tmp[2] = _mm_add_epi16(stp2_0, stp2_2);
- tmp[3] = _mm_sub_epi16(stp2_0, stp2_2);
-
- stp1_2 = _mm_unpackhi_epi64(tmp[3], tmp[2]);
- stp1_3 = _mm_unpacklo_epi64(tmp[3], tmp[2]);
-
- // Stage4
- tmp[0] = _mm_add_epi16(stp1_3, stp2_4);
- tmp[1] = _mm_add_epi16(stp1_2, stp1_5);
- tmp[2] = _mm_sub_epi16(stp1_3, stp2_4);
- tmp[3] = _mm_sub_epi16(stp1_2, stp1_5);
+ transpose_16bit_4x4(in, in);
+ // in[0]: 00 10 20 30 01 11 21 31
+ // in[1]: 02 12 22 32 03 13 23 33
+
+ // stage 1
+ tmp[0] = _mm_unpacklo_epi64(in[0], in[0]);
+ tmp[1] = _mm_unpackhi_epi64(in[0], in[0]);
+ tmp[2] = _mm_unpacklo_epi64(in[1], in[1]);
+ tmp[3] = _mm_unpackhi_epi64(in[1], in[1]);
+ step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7
+ step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1
+ step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2
+ step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6
+ step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6
+
+ // stage 3
+ tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]);
+ step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6
+ tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1
+ tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2
+ step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1
+ step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0
+
+ // stage 4
+ tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0
+ tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1
+ tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7
+ tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6
+
+ // pass 2
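+ // The packed pass 1 output in tmp[] is transposed back to column vectors.
+ // Only in[0] .. in[3] are produced; the remaining rows are known to be
+ // zero.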
idct8x8_12_transpose_16bit_4x8(tmp, in);
- /* Stage1 */
- stp1_4 = _mm_mulhrs_epi16(in[1], stg1_0);
- stp1_7 = _mm_mulhrs_epi16(in[1], stg1_1);
- stp1_5 = _mm_mulhrs_epi16(in[3], stg1_2);
- stp1_6 = _mm_mulhrs_epi16(in[3], stg1_3);
-
- /* Stage2 */
- stp2_0 = _mm_mulhrs_epi16(in[0], stg2_0);
- stp2_1 = _mm_mulhrs_epi16(in[0], stg2_0);
-
- stp2_2 = _mm_mulhrs_epi16(in[2], stg2_2);
- stp2_3 = _mm_mulhrs_epi16(in[2], stg2_3);
-
- stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
- stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
- stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
- stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
-
- /* Stage3 */
- stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
- stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
- stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
- stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
-
- tmp[0] = _mm_unpacklo_epi16(stp2_6, stp2_5);
- tmp[1] = _mm_unpackhi_epi16(stp2_6, stp2_5);
-
- tmp[2] = _mm_madd_epi16(tmp[0], stk2_0);
- tmp[3] = _mm_madd_epi16(tmp[1], stk2_0);
- tmp[2] = _mm_add_epi32(tmp[2], rounding);
- tmp[3] = _mm_add_epi32(tmp[3], rounding);
- tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
- tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS);
- stp1_6 = _mm_packs_epi32(tmp[2], tmp[3]);
-
- tmp[2] = _mm_madd_epi16(tmp[0], stk2_1);
- tmp[3] = _mm_madd_epi16(tmp[1], stk2_1);
- tmp[2] = _mm_add_epi32(tmp[2], rounding);
- tmp[3] = _mm_add_epi32(tmp[3], rounding);
- tmp[2] = _mm_srai_epi32(tmp[2], DCT_CONST_BITS);
- tmp[3] = _mm_srai_epi32(tmp[3], DCT_CONST_BITS);
- stp1_5 = _mm_packs_epi32(tmp[2], tmp[3]);
-
- /* Stage4 */
- in[0] = _mm_add_epi16(stp1_0, stp2_7);
- in[1] = _mm_add_epi16(stp1_1, stp1_6);
- in[2] = _mm_add_epi16(stp1_2, stp1_5);
- in[3] = _mm_add_epi16(stp1_3, stp2_4);
- in[4] = _mm_sub_epi16(stp1_3, stp2_4);
- in[5] = _mm_sub_epi16(stp1_2, stp1_5);
- in[6] = _mm_sub_epi16(stp1_1, stp1_6);
- in[7] = _mm_sub_epi16(stp1_0, stp2_7);
+ // stage 1
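+ // With in[4] .. in[7] known to be zero, each stage 1/2 butterfly
+ // degenerates into a single multiplication, so _mm_mulhrs_epi16() with the
+ // doubled constants replaces the usual unpack/madd/round/shift sequence.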
+ step1[4] = _mm_mulhrs_epi16(in[1], cospi_28_64d);
+ step1[7] = _mm_mulhrs_epi16(in[1], cospi_4_64d);
+ step1[5] = _mm_mulhrs_epi16(in[3], cospi_n20_64d);
+ step1[6] = _mm_mulhrs_epi16(in[3], cospi_12_64d);
+
+ // stage 2
+ step2[0] = _mm_mulhrs_epi16(in[0], cospi_16_64d); // step2[1] = step2[0]
+ step2[2] = _mm_mulhrs_epi16(in[2], cospi_24_64d);
+ step2[3] = _mm_mulhrs_epi16(in[2], cospi_8_64d);
+ step2[4] = _mm_add_epi16(step1[4], step1[5]);
+ step2[5] = _mm_sub_epi16(step1[4], step1[5]);
+ step2[6] = _mm_sub_epi16(step1[7], step1[6]);
+ step2[7] = _mm_add_epi16(step1[7], step1[6]);
+
+ // stage 3
+ step1[0] = _mm_add_epi16(step2[0], step2[3]);
+ step1[1] = _mm_add_epi16(step2[0], step2[2]);
+ step1[2] = _mm_sub_epi16(step2[0], step2[2]);
+ step1[3] = _mm_sub_epi16(step2[0], step2[3]);
+ multiplication_and_add_2(&step2[6], &step2[5], &cp_16_n16, &cp_16_16,
+ &step1[5], &step1[6]);
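+ // multiplication_and_add_2() keeps 32-bit intermediates to match the C
+ // reference rounding:
+ //   step1[5] = ROUND_POWER_OF_TWO((step2[6] - step2[5]) * cospi_16_64,
+ //                                 DCT_CONST_BITS)
+ //   step1[6] = ROUND_POWER_OF_TWO((step2[6] + step2[5]) * cospi_16_64,
+ //                                 DCT_CONST_BITS)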
+
+ // stage 4
+ in[0] = _mm_add_epi16(step1[0], step2[7]);
+ in[1] = _mm_add_epi16(step1[1], step1[6]);
+ in[2] = _mm_add_epi16(step1[2], step1[5]);
+ in[3] = _mm_add_epi16(step1[3], step2[4]);
+ in[4] = _mm_sub_epi16(step1[3], step2[4]);
+ in[5] = _mm_sub_epi16(step1[2], step1[5]);
+ in[6] = _mm_sub_epi16(step1[1], step1[6]);
+ in[7] = _mm_sub_epi16(step1[0], step2[7]);
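+ // write_buffer_8x8() applies the final ROUND_POWER_OF_TWO(x, 5) and adds
+ // the residual into dest.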
write_buffer_8x8(in, dest, stride);
}
-// Only do addition and subtraction butterfly, size = 16, 32
-static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out,
- int size) {
- int i = 0;
- const int num = size >> 1;
- const int bound = size - 1;
- while (i < num) {
- out[i] = _mm_add_epi16(in[i], in[bound - i]);
- out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]);
- i++;
- }
-}
-
-#define BUTTERFLY_PAIR(x0, x1, co0, co1) \
- do { \
- tmp0 = _mm_madd_epi16(x0, co0); \
- tmp1 = _mm_madd_epi16(x1, co0); \
- tmp2 = _mm_madd_epi16(x0, co1); \
- tmp3 = _mm_madd_epi16(x1, co1); \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- } while (0)
-
-static INLINE void butterfly(const __m128i *x0, const __m128i *x1,
- const __m128i *c0, const __m128i *c1, __m128i *y0,
- __m128i *y1) {
- __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- u0 = _mm_unpacklo_epi16(*x0, *x1);
- u1 = _mm_unpackhi_epi16(*x0, *x1);
- BUTTERFLY_PAIR(u0, u1, *c0, *c1);
- *y0 = _mm_packs_epi32(tmp0, tmp1);
- *y1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
-static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0,
- const __m128i *c1) {
- __m128i tmp0, tmp1, tmp2, tmp3, u0, u1;
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- u0 = _mm_unpacklo_epi16(*x0, *x1);
- u1 = _mm_unpackhi_epi16(*x0, *x1);
- BUTTERFLY_PAIR(u0, u1, *c0, *c1);
- *x0 = _mm_packs_epi32(tmp0, tmp1);
- *x1 = _mm_packs_epi32(tmp2, tmp3);
-}
-
static void idct32_34_first_half(const __m128i *in, __m128i *stp1) {
const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
int i;
// Load input data. Only need to load the top left 8x8 block.
- in[0] = load_input_data(input);
- in[1] = load_input_data(input + 32);
- in[2] = load_input_data(input + 64);
- in[3] = load_input_data(input + 96);
- in[4] = load_input_data(input + 128);
- in[5] = load_input_data(input + 160);
- in[6] = load_input_data(input + 192);
- in[7] = load_input_data(input + 224);
+ in[0] = load_input_data8(input + 0 * 32);
+ in[1] = load_input_data8(input + 1 * 32);
+ in[2] = load_input_data8(input + 2 * 32);
+ in[3] = load_input_data8(input + 3 * 32);
+ in[4] = load_input_data8(input + 4 * 32);
+ in[5] = load_input_data8(input + 5 * 32);
+ in[6] = load_input_data8(input + 6 * 32);
+ in[7] = load_input_data8(input + 7 * 32);
transpose_16bit_8x8(in, in);
idct32_34_first_half(in, stp1);
__m128i *in1) {
int i;
for (i = 0; i < 16; i++) {
- in0[i] = load_input_data(input);
- in1[i] = load_input_data(input + 8);
+ in0[i] = load_input_data8(input);
+ in1[i] = load_input_data8(input + 8);
input += 32;
}
}
add_sub_butterfly(out, in, 32);
}
-static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
- const __m128i final_rounding = _mm_set1_epi16(1 << 5);
- int j = 0;
- while (j < 32) {
- in[j] = _mm_adds_epi16(in[j], final_rounding);
- in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
-
- in[j] = _mm_srai_epi16(in[j], 6);
- in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
-
- recon_and_store(dst, in[j]);
- dst += stride;
- recon_and_store(dst, in[j + 1]);
- dst += stride;
- j += 2;
- }
-}
-
static INLINE void recon_and_store_ssse3(__m128i *in0, __m128i *in1,
uint8_t *dest, int stride) {
store_buffer_8x32(in0, dest, stride);
idct32_135(col0, col1);
recon_and_store_ssse3(col0, col1, dest + 16, stride);
}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 2, 6, 10, 14, 18, 22, 26, 30
-// output pixels: 8-15 in __m128i in[32]
-static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/,
- __m128i *out /*out[16]*/) {
- __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_
- __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_
-
- {
- const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
- butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15);
- butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14);
- }
-
- v8 = _mm_add_epi16(u8, u9);
- v9 = _mm_sub_epi16(u8, u9);
- v14 = _mm_sub_epi16(u15, u14);
- v15 = _mm_add_epi16(u15, u14);
-
- {
- const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
- butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13);
- butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12);
- }
-
- v10 = _mm_sub_epi16(u11, u10);
- v11 = _mm_add_epi16(u11, u10);
- v12 = _mm_add_epi16(u12, u13);
- v13 = _mm_sub_epi16(u12, u13);
-
- {
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&v9, &v14, &stg4_4, &stg4_5);
- butterfly_self(&v10, &v13, &stg4_6, &stg4_4);
- }
-
- out[0] = _mm_add_epi16(v8, v11);
- out[1] = _mm_add_epi16(v9, v10);
- out[6] = _mm_add_epi16(v14, v13);
- out[7] = _mm_add_epi16(v15, v12);
-
- out[2] = _mm_sub_epi16(v9, v10);
- out[3] = _mm_sub_epi16(v8, v11);
- out[4] = _mm_sub_epi16(v15, v12);
- out[5] = _mm_sub_epi16(v14, v13);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0);
- butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0);
- }
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with index, 0, 4, 8, 12, 16, 20, 24, 28
-// output pixels: 0-7 in __m128i in[32]
-static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/,
- __m128i *out /*out[8]*/) {
- __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_
- __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_
-
- {
- const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
- butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7);
- butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6);
- }
-
- v4 = _mm_add_epi16(u4, u5);
- v5 = _mm_sub_epi16(u4, u5);
- v6 = _mm_sub_epi16(u7, u6);
- v7 = _mm_add_epi16(u7, u6);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
- butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6);
-
- butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1);
- butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3);
- }
-
- v0 = _mm_add_epi16(u0, u3);
- v1 = _mm_add_epi16(u1, u2);
- v2 = _mm_sub_epi16(u1, u2);
- v3 = _mm_sub_epi16(u0, u3);
-
- out[0] = _mm_add_epi16(v0, v7);
- out[1] = _mm_add_epi16(v1, v6);
- out[2] = _mm_add_epi16(v2, v5);
- out[3] = _mm_add_epi16(v3, v4);
- out[4] = _mm_sub_epi16(v3, v4);
- out[5] = _mm_sub_epi16(v2, v5);
- out[6] = _mm_sub_epi16(v1, v6);
- out[7] = _mm_sub_epi16(v0, v7);
-}
-
-// For each 8x32 block __m128i in[32],
-// Input with odd index,
-// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
-// output pixels: 16-23, 24-31 in __m128i in[32]
-// We avoid hide an offset, 16, inside this function. So we output 0-15 into
-// array out[16]
-static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/,
- __m128i *out /*out[16]*/) {
- __m128i v16, v17, v18, v19, v20, v21, v22, v23;
- __m128i v24, v25, v26, v27, v28, v29, v30, v31;
- __m128i u16, u17, u18, u19, u20, u21, u22, u23;
- __m128i u24, u25, u26, u27, u28, u29, u30, u31;
-
- {
- const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
- butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31);
- butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30);
- butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29);
- butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28);
-
- butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27);
- butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26);
-
- butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25);
- butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24);
- }
-
- v16 = _mm_add_epi16(u16, u17);
- v17 = _mm_sub_epi16(u16, u17);
- v18 = _mm_sub_epi16(u19, u18);
- v19 = _mm_add_epi16(u19, u18);
-
- v20 = _mm_add_epi16(u20, u21);
- v21 = _mm_sub_epi16(u20, u21);
- v22 = _mm_sub_epi16(u23, u22);
- v23 = _mm_add_epi16(u23, u22);
-
- v24 = _mm_add_epi16(u24, u25);
- v25 = _mm_sub_epi16(u24, u25);
- v26 = _mm_sub_epi16(u27, u26);
- v27 = _mm_add_epi16(u27, u26);
-
- v28 = _mm_add_epi16(u28, u29);
- v29 = _mm_sub_epi16(u28, u29);
- v30 = _mm_sub_epi16(u31, u30);
- v31 = _mm_add_epi16(u31, u30);
-
- {
- const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
- const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
- butterfly_self(&v17, &v30, &stg3_4, &stg3_5);
- butterfly_self(&v18, &v29, &stg3_6, &stg3_4);
- butterfly_self(&v21, &v26, &stg3_8, &stg3_9);
- butterfly_self(&v22, &v25, &stg3_10, &stg3_8);
- }
-
- u16 = _mm_add_epi16(v16, v19);
- u17 = _mm_add_epi16(v17, v18);
- u18 = _mm_sub_epi16(v17, v18);
- u19 = _mm_sub_epi16(v16, v19);
- u20 = _mm_sub_epi16(v23, v20);
- u21 = _mm_sub_epi16(v22, v21);
- u22 = _mm_add_epi16(v22, v21);
- u23 = _mm_add_epi16(v23, v20);
-
- u24 = _mm_add_epi16(v24, v27);
- u25 = _mm_add_epi16(v25, v26);
- u26 = _mm_sub_epi16(v25, v26);
- u27 = _mm_sub_epi16(v24, v27);
-
- u28 = _mm_sub_epi16(v31, v28);
- u29 = _mm_sub_epi16(v30, v29);
- u30 = _mm_add_epi16(v29, v30);
- u31 = _mm_add_epi16(v28, v31);
-
- {
- const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- butterfly_self(&u18, &u29, &stg4_4, &stg4_5);
- butterfly_self(&u19, &u28, &stg4_4, &stg4_5);
- butterfly_self(&u20, &u27, &stg4_6, &stg4_4);
- butterfly_self(&u21, &u26, &stg4_6, &stg4_4);
- }
-
- out[0] = _mm_add_epi16(u16, u23);
- out[1] = _mm_add_epi16(u17, u22);
- out[2] = _mm_add_epi16(u18, u21);
- out[3] = _mm_add_epi16(u19, u20);
- out[4] = _mm_sub_epi16(u19, u20);
- out[5] = _mm_sub_epi16(u18, u21);
- out[6] = _mm_sub_epi16(u17, u22);
- out[7] = _mm_sub_epi16(u16, u23);
-
- out[8] = _mm_sub_epi16(u31, u24);
- out[9] = _mm_sub_epi16(u30, u25);
- out[10] = _mm_sub_epi16(u29, u26);
- out[11] = _mm_sub_epi16(u28, u27);
- out[12] = _mm_add_epi16(u27, u28);
- out[13] = _mm_add_epi16(u26, u29);
- out[14] = _mm_add_epi16(u25, u30);
- out[15] = _mm_add_epi16(u24, u31);
-
- {
- const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
- const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0);
- butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0);
- butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0);
- butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0);
- }
-}
-
-static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/,
- __m128i *out /*out[32]*/) {
- __m128i temp[16];
- idct32_full_8x32_quarter_1(in, temp);
- idct32_full_8x32_quarter_2(in, &temp[8]);
- add_sub_butterfly(temp, out, 16);
-}
-
-static void idct32_full_8x32(const __m128i *in /*in[32]*/,
- __m128i *out /*out[32]*/) {
- __m128i temp[32];
- idct32_full_8x32_quarter_1_2(in, temp);
- idct32_full_8x32_quarter_3_4(in, &temp[16]);
- add_sub_butterfly(temp, out, 32);
-}
-
-static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
- int i;
- for (i = 0; i < 8; ++i) {
- in[i] = load_input_data(input);
- in[i + 8] = load_input_data(input + 8);
- in[i + 16] = load_input_data(input + 16);
- in[i + 24] = load_input_data(input + 24);
- input += 32;
- }
-}
-
-void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
- int stride) {
- __m128i col[128], in[32];
- int i, j;
-
- // rows
- for (i = 0; i < 4; ++i) {
- load_buffer_8x32(input, in);
- input += 32 << 3;
-
- // Transpose 32x8 block to 8x32 block
- transpose_16bit_8x8(in, in);
- transpose_16bit_8x8(in + 8, in + 8);
- transpose_16bit_8x8(in + 16, in + 16);
- transpose_16bit_8x8(in + 24, in + 24);
-
- idct32_full_8x32(in, col + (i << 5));
- }
-
- // columns
- for (i = 0; i < 4; ++i) {
- j = i << 3;
- // Transpose 32x8 block to 8x32 block
- transpose_16bit_8x8(col + j, in);
- transpose_16bit_8x8(col + j + 32, in + 8);
- transpose_16bit_8x8(col + j + 64, in + 16);
- transpose_16bit_8x8(col + j + 96, in + 24);
-
- idct32_full_8x32(in, in);
- store_buffer_8x32(in, dest, stride);
- dest += 8;
- }
-}