armv7-win32-vs11
armv7-win32-vs12
armv7-win32-vs14
+ armv7-win32-vs15
armv7s-darwin-gcc
armv8-linux-gcc
mips32-linux-gcc
x86-win32-vs11
x86-win32-vs12
x86-win32-vs14
+ x86-win32-vs15
x86_64-android-gcc
x86_64-darwin9-gcc
x86_64-darwin10-gcc
x86_64-win64-vs11
x86_64-win64-vs12
x86_64-win64-vs14
+ x86_64-win64-vs15
generic-gnu
The generic-gnu target, in conjunction with the CROSS environment variable,
can be used to cross compile architectures that aren't explicitly listed, if
the toolchain is a cross GNU (gcc/binutils) toolchain.
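For example, a cross build for an unlisted architecture might look like the
following (the toolchain prefix shown is only illustrative):

  CROSS=arm-linux-gnueabihf- ../libvpx/configure --target=generic-gnu
  make
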
Options:
--help Print this message
--out=outfile Redirect output to a file
- --ver=version Version (7,8,9,10,11,12,14) of visual studio to generate for
+ --ver=version Version (7,8,9,10,11,12,14,15) of visual studio to generate for
--target=isa-os-cc Target specifier
EOF
exit 1
;;
--ver=*) vs_ver="$optval"
case $optval in
- 10|11|12|14)
+ 10|11|12|14|15)
;;
*) die Unrecognized Visual Studio Version in $opt
;;
14) sln_vers="14.00"
sln_vers_str="Visual Studio 2015"
;;
+ 15) sln_vers="15.00"
+ sln_vers_str="Visual Studio 2017"
+ ;;
esac
sfx=vcxproj
--name=project_name Name of the project (required)
--proj-guid=GUID GUID to use for the project
--module-def=filename File containing export definitions (for DLLs)
- --ver=version Version (10,11,12,14) of visual studio to generate for
+ --ver=version Version (10,11,12,14,15) of visual studio to generate for
--src-path-bare=dir Path to root of source tree
-Ipath/to/include Additional include directories
-DFLAG[=value] Preprocessor macros to define
--ver=*)
vs_ver="$optval"
case "$optval" in
- 10|11|12|14)
+ 10|11|12|14|15)
;;
*) die Unrecognized Visual Studio Version in $opt
;;
asm_use_custom_step=false
uses_asm=${uses_asm:-false}
case "${vs_ver:-11}" in
- 10|11|12|14)
+ 10|11|12|14|15)
asm_use_custom_step=$uses_asm
;;
esac
if [ "$vs_ver" = "14" ]; then
tag_content PlatformToolset v140
fi
+ if [ "$vs_ver" = "15" ]; then
+ tag_content PlatformToolset v141
+ fi
tag_content CharacterSet Unicode
if [ "$config" = "Release" ]; then
tag_content WholeProgramOptimization true
all_platforms="${all_platforms} armv7-win32-vs11"
all_platforms="${all_platforms} armv7-win32-vs12"
all_platforms="${all_platforms} armv7-win32-vs14"
+all_platforms="${all_platforms} armv7-win32-vs15"
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} armv8-linux-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} x86-win32-vs11"
all_platforms="${all_platforms} x86-win32-vs12"
all_platforms="${all_platforms} x86-win32-vs14"
+all_platforms="${all_platforms} x86-win32-vs15"
all_platforms="${all_platforms} x86_64-android-gcc"
all_platforms="${all_platforms} x86_64-darwin9-gcc"
all_platforms="${all_platforms} x86_64-darwin10-gcc"
all_platforms="${all_platforms} x86_64-win64-vs11"
all_platforms="${all_platforms} x86_64-win64-vs12"
all_platforms="${all_platforms} x86_64-win64-vs14"
+all_platforms="${all_platforms} x86_64-win64-vs15"
all_platforms="${all_platforms} generic-gnu"
# all_targets is a list of all targets that can be configured
int right_padding, int bottom_padding)
: width_(width), height_(height), top_padding_(top_padding),
left_padding_(left_padding), right_padding_(right_padding),
- bottom_padding_(bottom_padding) {
- Init();
- }
+ bottom_padding_(bottom_padding), raw_buffer_(NULL) {}
Buffer(int width, int height, int padding)
: width_(width), height_(height), top_padding_(padding),
left_padding_(padding), right_padding_(padding),
- bottom_padding_(padding) {
- Init();
- }
+ bottom_padding_(padding), raw_buffer_(NULL) {}
~Buffer() { delete[] raw_buffer_; }
int stride() const { return stride_; }
// Set the buffer (excluding padding) to 'value'.
- void Set(const int value);
+ void Set(const T value);
// Set the buffer (excluding padding) to the output of ACMRandom function 'b'.
void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)());
bool HasPadding() const;
// Sets all the values in the buffer to 'padding_value'.
- void SetPadding(const int padding_value);
+ void SetPadding(const T padding_value);
// Checks if all the values (excluding padding) are equal to 'value' if the
// Buffers are the same size.
- bool CheckValues(const int value) const;
+ bool CheckValues(const T value) const;
// Check that padding matches the expected value or there is no padding.
bool CheckPadding() const;
// Compare the non-padding portion of two buffers if they are the same size.
bool CheckValues(const Buffer<T> &a) const;
- private:
- void Init() {
- ASSERT_GT(width_, 0);
- ASSERT_GT(height_, 0);
- ASSERT_GE(top_padding_, 0);
- ASSERT_GE(left_padding_, 0);
- ASSERT_GE(right_padding_, 0);
- ASSERT_GE(bottom_padding_, 0);
+ bool Init() {
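+    // EXPECT_*, rather than ASSERT_*, is used here because Init() returns a
+    // value; ASSERT_* may only appear in functions that return void. Any
+    // failure is reported via HasFailure() in the return statement below.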
+ EXPECT_GT(width_, 0);
+ EXPECT_GT(height_, 0);
+ EXPECT_GE(top_padding_, 0);
+ EXPECT_GE(left_padding_, 0);
+ EXPECT_GE(right_padding_, 0);
+ EXPECT_GE(bottom_padding_, 0);
stride_ = left_padding_ + width_ + right_padding_;
raw_size_ = stride_ * (top_padding_ + height_ + bottom_padding_);
raw_buffer_ = new (std::nothrow) T[raw_size_];
- ASSERT_TRUE(raw_buffer_ != NULL);
+ EXPECT_TRUE(raw_buffer_ != NULL);
SetPadding(std::numeric_limits<T>::max());
+ return !::testing::Test::HasFailure();
}
+ private:
bool BufferSizesMatch(const Buffer<T> &a) const;
const int width_;
const int left_padding_;
const int right_padding_;
const int bottom_padding_;
- int padding_value_;
+ T padding_value_;
int stride_;
int raw_size_;
T *raw_buffer_;
template <typename T>
T *Buffer<T>::TopLeftPixel() const {
+ if (!raw_buffer_) return NULL;
return raw_buffer_ + (top_padding_ * stride()) + left_padding_;
}
template <typename T>
-void Buffer<T>::Set(const int value) {
+void Buffer<T>::Set(const T value) {
+ if (!raw_buffer_) return;
T *src = TopLeftPixel();
for (int height = 0; height < height_; ++height) {
for (int width = 0; width < width_; ++width) {
template <typename T>
void Buffer<T>::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) {
+ if (!raw_buffer_) return;
T *src = TopLeftPixel();
for (int height = 0; height < height_; ++height) {
for (int width = 0; width < width_; ++width) {
template <typename T>
void Buffer<T>::CopyFrom(const Buffer<T> &a) {
- if (!BufferSizesMatch(a)) {
- return;
- }
+ if (!raw_buffer_) return;
+ if (!BufferSizesMatch(a)) return;
T *a_src = a.TopLeftPixel();
T *b_src = this->TopLeftPixel();
template <typename T>
void Buffer<T>::DumpBuffer() const {
+ if (!raw_buffer_) return;
for (int height = 0; height < height_ + top_padding_ + bottom_padding_;
++height) {
for (int width = 0; width < stride(); ++width) {
template <typename T>
bool Buffer<T>::HasPadding() const {
+ if (!raw_buffer_) return false;
return top_padding_ || left_padding_ || right_padding_ || bottom_padding_;
}
template <typename T>
void Buffer<T>::PrintDifference(const Buffer<T> &a) const {
- if (!BufferSizesMatch(a)) {
- return;
- }
+ if (!raw_buffer_) return;
+ if (!BufferSizesMatch(a)) return;
T *a_src = a.TopLeftPixel();
T *b_src = TopLeftPixel();
}
template <typename T>
-void Buffer<T>::SetPadding(const int padding_value) {
+void Buffer<T>::SetPadding(const T padding_value) {
+ if (!raw_buffer_) return;
padding_value_ = padding_value;
T *src = raw_buffer_;
}
template <typename T>
-bool Buffer<T>::CheckValues(const int value) const {
+bool Buffer<T>::CheckValues(const T value) const {
+ if (!raw_buffer_) return false;
T *src = TopLeftPixel();
for (int height = 0; height < height_; ++height) {
for (int width = 0; width < width_; ++width) {
template <typename T>
bool Buffer<T>::CheckPadding() const {
- if (!HasPadding()) {
- return true;
- }
+ if (!raw_buffer_) return false;
+ if (!HasPadding()) return true;
// Top padding.
T const *top = raw_buffer_;
template <typename T>
bool Buffer<T>::CheckValues(const Buffer<T> &a) const {
- if (!BufferSizesMatch(a)) {
- return false;
- }
+ if (!raw_buffer_) return false;
+ if (!BufferSizesMatch(a)) return false;
T *a_src = a.TopLeftPixel();
T *b_src = this->TopLeftPixel();
template <typename T>
bool Buffer<T>::BufferSizesMatch(const Buffer<T> &a) const {
+ if (!raw_buffer_) return false;
if (a.width_ != this->width_ || a.height_ != this->height_) {
printf(
"Reference buffer of size %dx%d does not match this buffer which is "
// Only the reference buffer may have a stride not equal to width.
Buffer<uint8_t> ref =
Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+ ASSERT_TRUE(ref.Init());
fill(&rnd_, pred, width, height);
ref.Set(&rnd_, &ACMRandom::Rand8);
const int width = 64;
const int height = 32;
Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8);
+ ASSERT_TRUE(ref.Init());
DECLARE_ALIGNED(16, uint8_t, pred[width * height]);
DECLARE_ALIGNED(16, uint8_t, avg_ref[width * height]);
DECLARE_ALIGNED(16, uint8_t, avg_chk[width * height]);
const int height = 1 << height_pow;
Buffer<uint8_t> ref =
Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+ ASSERT_TRUE(ref.Init());
fill(&rnd_, pred, width, height);
ref.Set(&rnd_, &ACMRandom::Rand8);
INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest,
::testing::Values(&vpx_comp_avg_pred_sse2));
#endif // HAVE_SSE2
+
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest,
+ ::testing::Values(&vpx_comp_avg_pred_neon));
+#endif // HAVE_NEON
+
#if HAVE_VSX
INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest,
::testing::Values(&vpx_comp_avg_pred_vsx));
30, 1, 0, 140);
const int kDropFrameThreshTestStep = 30;
- vpx_codec_pts_t last_drop = 140;
- int last_num_drops = 0;
- for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
- cfg_.rc_dropframe_thresh = i;
- ResetModel();
- ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
- ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
- << " The datarate for the file is lower than target by too much!";
- ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15)
- << " The datarate for the file is greater than target by too much!";
- ASSERT_LE(first_drop_, last_drop)
- << " The first dropped frame for drop_thresh " << i
- << " > first dropped frame for drop_thresh "
- << i - kDropFrameThreshTestStep;
- ASSERT_GE(num_drops_, last_num_drops * 0.85)
- << " The number of dropped frames for drop_thresh " << i
- << " < number of dropped frames for drop_thresh "
- << i - kDropFrameThreshTestStep;
- last_drop = first_drop_;
- last_num_drops = num_drops_;
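+  // Repeat the drop-frame threshold sweep at two target bitrates.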
+ for (int j = 50; j <= 150; j += 100) {
+ cfg_.rc_target_bitrate = j;
+ vpx_codec_pts_t last_drop = 140;
+ int last_num_drops = 0;
+ for (int i = 10; i < 100; i += kDropFrameThreshTestStep) {
+ cfg_.rc_dropframe_thresh = i;
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85)
+ << " The datarate for the file is lower than target by too much!";
+ ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25)
+ << " The datarate for the file is greater than target by too much!";
+ ASSERT_LE(first_drop_, last_drop)
+ << " The first dropped frame for drop_thresh " << i
+ << " > first dropped frame for drop_thresh "
+ << i - kDropFrameThreshTestStep;
+ ASSERT_GE(num_drops_, last_num_drops * 0.85)
+ << " The number of dropped frames for drop_thresh " << i
+ << " < number of dropped frames for drop_thresh "
+ << i - kDropFrameThreshTestStep;
+ last_drop = first_drop_;
+ last_num_drops = num_drops_;
+ }
}
}
#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT,
- ::testing::Values(make_tuple(&vpx_fdct4x4_c,
+ ::testing::Values(make_tuple(&vpx_fdct4x4_neon,
&vpx_idct4x4_16_add_neon,
0, VPX_BITS_8)));
#if !CONFIG_VP9_HIGHBITDEPTH
UUT = GetParam();
input = new (std::nothrow) Buffer<int16_t>(4, 4, 0);
- ASSERT_TRUE(input != NULL);
+ ASSERT_TRUE(input->Init());
predict = new (std::nothrow) Buffer<uint8_t>(4, 4, 3);
- ASSERT_TRUE(predict != NULL);
+ ASSERT_TRUE(predict->Init());
output = new (std::nothrow) Buffer<uint8_t>(4, 4, 3);
- ASSERT_TRUE(output != NULL);
+ ASSERT_TRUE(output->Init());
}
virtual void TearDown() {
}
void InitInput() {
- const int max_coeff = 32766 / 4;
- int max_energy_leftover = max_coeff * max_coeff;
+ const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4;
+ int64_t max_energy_leftover = max_coeff * max_coeff;
for (int j = 0; j < last_nonzero_; ++j) {
- int16_t coeff = static_cast<int16_t>(sqrt(1.0 * max_energy_leftover) *
- (rnd_.Rand16() - 32768) / 65536);
- max_energy_leftover -= coeff * coeff;
+ tran_low_t coeff = static_cast<tran_low_t>(
+ sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536);
+ max_energy_leftover -= static_cast<int64_t>(coeff) * coeff;
if (max_energy_leftover < 0) {
max_energy_leftover = 0;
coeff = 0;
}
}
+ void PrintDiff() {
+ if (memcmp(output_block_ref_, output_block_,
+ pixel_size_ * output_block_size_)) {
+ uint16_t ref, opt;
+ for (int y = 0; y < size_; y++) {
+ for (int x = 0; x < size_; x++) {
+ if (pixel_size_ == 1) {
+ ref = output_block_ref_[y * stride_ + x];
+ opt = output_block_[y * stride_ + x];
+ } else {
+ ref = reinterpret_cast<uint16_t *>(
+ output_block_ref_)[y * stride_ + x];
+ opt = reinterpret_cast<uint16_t *>(output_block_)[y * stride_ + x];
+ }
+ if (ref != opt) {
+ printf("dest[%d][%d] diff:%6d (ref),%6d (opt)\n", y, x, ref, opt);
+ }
+ }
+ }
+
+ printf("\ninput_block_:\n");
+ for (int y = 0; y < size_; y++) {
+ for (int x = 0; x < size_; x++) {
+ printf("%6d,", input_block_[y * size_ + x]);
+ }
+ printf("\n");
+ }
+ }
+ }
+
protected:
int last_nonzero_;
TX_SIZE tx_size_;
};
TEST_P(PartialIDctTest, RunQuantCheck) {
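+  // For the 4x4 case there are 16 coefficients, so 65536 (= 2^16) iterations
+  // are enough to exercise every +/-mask_ sign combination below.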
+ const int count_test_block = (size_ != 4) ? kCountTestBlock : 65536;
DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxNumCoeffs]);
DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kMaxNumCoeffs]);
InitMem();
- for (int i = 0; i < kCountTestBlock; ++i) {
+
+ for (int i = 0; i < count_test_block; ++i) {
// Initialize a test block with input range [-mask_, mask_].
- if (i == 0) {
- for (int k = 0; k < input_block_size_; ++k) {
- input_extreme_block[k] = mask_;
- }
- } else if (i == 1) {
- for (int k = 0; k < input_block_size_; ++k) {
- input_extreme_block[k] = -mask_;
+ if (size_ != 4) {
+ if (i == 0) {
+ for (int k = 0; k < input_block_size_; ++k) {
+ input_extreme_block[k] = mask_;
+ }
+ } else if (i == 1) {
+ for (int k = 0; k < input_block_size_; ++k) {
+ input_extreme_block[k] = -mask_;
+ }
+ } else {
+ for (int k = 0; k < input_block_size_; ++k) {
+ input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_;
+ }
}
} else {
+ // Try all possible combinations.
for (int k = 0; k < input_block_size_; ++k) {
- input_extreme_block[k] = rnd_.Rand8() % 2 ? mask_ : -mask_;
+ input_extreme_block[k] = (i & (1 << k)) ? mask_ : -mask_;
}
}
vpx_usec_timer_mark(&timer);
const int elapsed_time =
static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000);
- printf("idct%dx%d_%d (bitdepth %d) time: %5d ms\n", size_, size_,
- last_nonzero_, bit_depth_, elapsed_time);
-
+ printf("idct%dx%d_%d (%s %d) time: %5d ms\n", size_, size_, last_nonzero_,
+ (pixel_size_ == 1) ? "bitdepth" : "high bitdepth", bit_depth_,
+ elapsed_time);
ASSERT_EQ(0, memcmp(output_block_ref_, output_block_,
pixel_size_ * output_block_size_))
<< "Error: partial inverse transform produces different results";
make_tuple(
&vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>,
&highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 12, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 8, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 10, 2),
+ make_tuple(
+ &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct16x16_1_add_sse2>, TX_16X16, 1, 12, 2),
make_tuple(&vpx_highbd_fdct8x8_c,
&highbd_wrapper<vpx_highbd_idct8x8_64_add_c>,
&highbd_wrapper<vpx_highbd_idct8x8_64_add_sse2>, TX_8X8, 64, 8, 2),
make_tuple(
&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>,
&highbd_wrapper<vpx_highbd_idct8x8_12_add_sse2>, TX_8X8, 12, 12, 2),
+ make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 8, 2),
+ make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 10, 2),
+ make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct8x8_1_add_sse2>, TX_8X8, 1, 12, 2),
make_tuple(&vpx_highbd_fdct4x4_c,
&highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
&highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 8, 2),
make_tuple(
&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>,
&highbd_wrapper<vpx_highbd_idct4x4_16_add_sse2>, TX_4X4, 16, 12, 2),
+ make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 8, 2),
+ make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 10, 2),
+ make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_1_add_c>,
+ &highbd_wrapper<vpx_highbd_idct4x4_1_add_sse2>, TX_4X4, 1, 12, 2),
#endif // CONFIG_VP9_HIGHBITDEPTH
make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
&wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1),
// 5-tap filter needs 2 padding rows above and below the block in the input.
Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
+ ASSERT_TRUE(src_image.Init());
// Filter extends output block by 8 samples at left and right edges.
// Though the left padding is only 8 bytes, the assembly code tries to
// read 16 bytes before the pointer.
Buffer<uint8_t> dst_image =
Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
+ ASSERT_TRUE(dst_image.Init());
uint8_t *const flimits =
reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
// SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
Buffer<uint8_t> src_image =
Buffer<uint8_t>(block_width, block_height, 2, 2, 10, 2);
+ ASSERT_TRUE(src_image.Init());
// Filter extends output block by 8 samples at left and right edges.
// Though the left padding is only 8 bytes, there is 'above' padding as well
// SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
Buffer<uint8_t> dst_image =
Buffer<uint8_t>(block_width, block_height, 8, 8, 16, 8);
+ ASSERT_TRUE(dst_image.Init());
Buffer<uint8_t> dst_image_ref = Buffer<uint8_t>(block_width, block_height, 8);
+ ASSERT_TRUE(dst_image_ref.Init());
// Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock
// can have a different filter. SSE2 assembly reads flimits in blocks of 16 so
const int cols = 16;
Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(src.Init());
src.SetPadding(10);
SetCols(src.TopLeftPixel(), rows, cols, src.stride());
Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols, rows, 0);
+ ASSERT_TRUE(expected_output.Init());
SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride());
RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0),
const int cols = 16;
Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(src.Init());
src.SetPadding(10);
SetCols(src.TopLeftPixel(), rows, cols, src.stride());
const int cols = 16;
Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(src.Init());
src.SetPadding(10);
SetCols(src.TopLeftPixel(), rows, cols, src.stride());
const int cols = 16;
Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(c_mem.Init());
Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8);
+ ASSERT_TRUE(asm_mem.Init());
// When level >= 100, the filter behaves the same as the level = INT_MAX
// When level < 20, it behaves the same as the level = 0
const int cols = 16;
Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_c.Init());
src_c.SetPadding(10);
SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
const int cols = 16;
Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_c.Init());
src_c.SetPadding(10);
SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
const int cols = 16;
Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_c.Init());
src_c.SetPadding(10);
SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride());
rnd.Reset(ACMRandom::DeterministicSeed());
Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_c.Init());
Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17);
+ ASSERT_TRUE(src_asm.Init());
for (int level = 0; level < 100; level++) {
src_c.SetPadding(10);
Buffer<unsigned int> *accumulator,
Buffer<uint16_t> *count) {
Buffer<int> diff_sq = Buffer<int>(w, h, 0);
+ ASSERT_TRUE(diff_sq.Init());
diff_sq.Set(0);
int rounding = 0;
// Depending on subsampling this function may be called with values of 8 or 16
// for width and height, in any combination.
Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+ ASSERT_TRUE(a.Init());
const int filter_weight = 2;
const int filter_strength = 6;
for (int height = 8; height <= 16; height += 8) {
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+ ASSERT_TRUE(b.Init());
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_ref.Init());
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_chk.Init());
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_ref.Init());
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_chk.Init());
a.Set(&rnd_, &ACMRandom::Rand8);
b.Set(&rnd_, &ACMRandom::Rand8);
for (int width = 8; width <= 16; width += 8) {
for (int height = 8; height <= 16; height += 8) {
Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8);
+ ASSERT_TRUE(a.Init());
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+ ASSERT_TRUE(b.Init());
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_ref.Init());
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_chk.Init());
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_ref.Init());
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_chk.Init());
for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) {
for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) {
TEST_P(TemporalFilterTest, DISABLED_Speed) {
Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8);
+ ASSERT_TRUE(a.Init());
const int filter_weight = 2;
const int filter_strength = 6;
for (int height = 8; height <= 16; height += 8) {
// The second buffer must not have any border.
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0);
+ ASSERT_TRUE(b.Init());
Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_ref.Init());
Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0);
+ ASSERT_TRUE(accum_chk.Init());
Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_ref.Init());
Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0);
+ ASSERT_TRUE(count_chk.Init());
a.Set(&rnd_, &ACMRandom::Rand8);
b.Set(&rnd_, &ACMRandom::Rand8);
VarianceParams(4, 3, &vpx_variance16x8_neon),
VarianceParams(3, 4, &vpx_variance8x16_neon),
VarianceParams(3, 3, &vpx_variance8x8_neon),
- VarianceParams(3, 2, &vpx_variance8x4_neon)));
+ VarianceParams(3, 2, &vpx_variance8x4_neon),
+ VarianceParams(2, 3, &vpx_variance4x8_neon),
+ VarianceParams(2, 2, &vpx_variance4x4_neon)));
INSTANTIATE_TEST_CASE_P(
NEON, VpxSubpelVarianceTest,
make_tuple(4, 3, &vpx_sub_pixel_variance16x8_neon, 0),
make_tuple(3, 4, &vpx_sub_pixel_variance8x16_neon, 0),
make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0),
- make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0)));
+ make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0),
+ make_tuple(2, 3, &vpx_sub_pixel_variance4x8_neon, 0),
+ make_tuple(2, 2, &vpx_sub_pixel_variance4x4_neon, 0)));
+
+INSTANTIATE_TEST_CASE_P(
+ NEON, VpxSubpelAvgVarianceTest,
+ ::testing::Values(
+ make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0),
+ make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0),
+ make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0),
+ make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0),
+ make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0),
+ make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0),
+ make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0),
+ make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0),
+ make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0),
+ make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0),
+ make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
+ make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
+ make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
#endif // HAVE_NEON
#if HAVE_MSA
# TODO(tomfinegan): Verify file output for all thread runs.
for threads in $(seq $max_threads); do
- eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \
- "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \
- "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \
- "${error_resilient}" "${threads}" "$@" \
- ${devnull}
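+    # When the codec is built with VP9 high bit depth support, pass an extra
+    # bit-depth argument ("8") to the encoder example.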
+ if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \
+ "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \
+ "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+ "$@" ${devnull}
+ else
+ eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \
+ "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \
+ "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \
+ "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \
+ "$@" "8" ${devnull}
+ fi
done
}
#include <arm_neon.h>
#include <string.h>
#include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 },
{ 96, 32 }, { 80, 48 },
return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
}
-static INLINE void store4x4(unsigned char *dst, int dst_stride,
- const uint8x8_t a0, const uint8x8_t a1) {
- if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
- } else {
- // Store to the aligned local buffer and memcpy instead of vget_lane_u8
- // which is really really slow.
- uint32_t output_buffer[4];
- vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
- vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
- vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
- vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);
-
- memcpy(dst, output_buffer, 4);
- dst += dst_stride;
- memcpy(dst, output_buffer + 1, 4);
- dst += dst_stride;
- memcpy(dst, output_buffer + 2, 4);
- dst += dst_stride;
- memcpy(dst, output_buffer + 3, 4);
- }
-}
-
void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
int src_pixels_per_line, int xoffset,
int yoffset, unsigned char *dst_ptr,
// secondpass_filter
if (yoffset == 0) { // skip_2ndpass_filter
- store4x4(dst_ptr, dst_pitch, e0, e1);
+ store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
} else {
uint8x8_t f0, f1;
const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
f0 = vqrshrn_n_u16(b0, 7);
f1 = vqrshrn_n_u16(b1, 7);
- store4x4(dst_ptr, dst_pitch, f0, f1);
+ store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(f0, f1));
}
}
#include <arm_neon.h>
#include <string.h>
#include "./vpx_config.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_ports/mem.h"
static const int8_t vp8_sub_pel_filters[8][8] = {
return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
}
-static INLINE void store4x4(unsigned char *dst, int dst_stride,
- const uint8x8_t a0, const uint8x8_t a1) {
- if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
- } else {
- // Store to the aligned local buffer and memcpy instead of vget_lane_u8
- // which is really really slow.
- uint32_t output_buffer[4];
- vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
- vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
- vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
- vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);
-
- memcpy(dst, output_buffer, 4);
- dst += dst_stride;
- memcpy(dst, output_buffer + 1, 4);
- dst += dst_stride;
- memcpy(dst, output_buffer + 2, 4);
- dst += dst_stride;
- memcpy(dst, output_buffer + 3, 4);
- }
-}
-
static INLINE void filter_add_accumulate(const uint8x16_t a, const uint8x16_t b,
const uint8x8_t filter, uint16x8_t *c,
uint16x8_t *d) {
e0 = vqrshrun_n_s16(d0, 7);
e1 = vqrshrun_n_s16(d1, 7);
- store4x4(dst, dst_stride, e0, e1);
+ store_unaligned_u8q(dst, dst_stride, vcombine_u8(e0, e1));
}
void vp8_sixtap_predict4x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
b2 = vqrshrun_n_s16(e4567, 7);
if (yoffset == 0) { // firstpass_filter4x4_only
- store4x4(dst_ptr, dst_pitch, b0, b2);
+ store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(b0, b2));
return;
}
e0 = vqrshrun_n_s16(d0, 7);
e1 = vqrshrun_n_s16(d1, 7);
- store4x4(dst_ptr, dst_pitch, e0, e1);
+ store_unaligned_u8q(dst_ptr, dst_pitch, vcombine_u8(e0, e1));
}
void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line,
--- /dev/null
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp8/common/skin_detection.h"
+#include "vp8/common/alloccommon.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+
+#define MODEL_MODE 1
+
+// Fixed-point skin color model parameters.
+static const int skin_mean[5][2] = { { 7463, 9614 },
+ { 6400, 10240 },
+ { 7040, 10240 },
+ { 8320, 9280 },
+ { 6800, 9614 } };
+static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16
+static const int skin_threshold[6] = { 1570636, 1400000, 800000,
+ 800000, 800000, 800000 }; // q18
+
+// Thresholds on luminance.
+static const int y_low = 40;
+static const int y_high = 220;
+
+// Evaluates the Mahalanobis distance measure for the input CbCr values.
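+// The Cb/Cr inputs are promoted to Q6, so the squared differences below are
+// Q12; these are rounded to Q2 and weighted by the Q16 inverse covariance
+// terms, giving a Q18 sum that is compared against the Q18 skin_threshold[]
+// values.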
+static int evaluate_skin_color_difference(const int cb, const int cr,
+ const int idx) {
+ const int cb_q6 = cb << 6;
+ const int cr_q6 = cr << 6;
+ const int cb_diff_q12 =
+ (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
+ const int cbcr_diff_q12 =
+ (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
+ const int cr_diff_q12 =
+ (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
+ const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
+ const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
+ const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
+ const int skin_diff =
+ skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
+ skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
+ return skin_diff;
+}
+
+// Checks if the input yCbCr values correspond to skin color.
+int skin_pixel(int y, int cb, int cr, int motion) {
+ if (y < y_low || y > y_high) {
+ return 0;
+ } else {
+ if (MODEL_MODE == 0) {
+ return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
+ } else {
+ int i = 0;
+ // Exit on grey.
+ if (cb == 128 && cr == 128) return 0;
+ // Exit on very strong cb.
+ if (cb > 150 && cr < 110) return 0;
+ for (; i < 5; ++i) {
+ int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
+ if (skin_color_diff < skin_threshold[i + 1]) {
+ if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
+ return 0;
+ } else if (motion == 0 &&
+ skin_color_diff > (skin_threshold[i + 1] >> 1)) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+      // Exit if the difference is much larger than the threshold.
+ if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
+ return 0;
+ }
+ }
+ return 0;
+ }
+ }
+}
+
+int compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int stride, int strideuv, int consec_zeromv,
+ int curr_motion_magn) {
+  // No skin if the block has had zero/small motion for many consecutive frames.
+ if (consec_zeromv > 60 && curr_motion_magn == 0) {
+ return 0;
+ } else {
+ int motion = 1;
+ // Take the average of center 2x2 pixels.
+ const int ysource = (y[7 * stride + 7] + y[7 * stride + 8] +
+ y[8 * stride + 7] + y[8 * stride + 8]) >>
+ 2;
+ const int usource = (u[3 * strideuv + 3] + u[3 * strideuv + 4] +
+ u[4 * strideuv + 3] + u[4 * strideuv + 4]) >>
+ 2;
+ const int vsource = (v[3 * strideuv + 3] + v[3 * strideuv + 4] +
+ v[4 * strideuv + 3] + v[4 * strideuv + 4]) >>
+ 2;
+ if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0;
+ return skin_pixel(ysource, usource, vsource, motion);
+ }
+}
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) {
+ int i, j, mb_row, mb_col, num_bl;
+ VP8_COMMON *const cm = &cpi->common;
+ uint8_t *y;
+ const uint8_t *src_y = cpi->Source->y_buffer;
+ const uint8_t *src_u = cpi->Source->u_buffer;
+ const uint8_t *src_v = cpi->Source->v_buffer;
+ const int src_ystride = cpi->Source->y_stride;
+ const int src_uvstride = cpi->Source->uv_stride;
+
+ YV12_BUFFER_CONFIG skinmap;
+ memset(&skinmap, 0, sizeof(skinmap));
+ if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height,
+ VP8BORDERINPIXELS) < 0) {
+ vpx_free_frame_buffer(&skinmap);
+ return;
+ }
+ memset(skinmap.buffer_alloc, 128, skinmap.frame_size);
+ y = skinmap.y_buffer;
+  // Loop through the blocks and set the skin map based on the center pixels
+  // of each block. Set luma to white for skin blocks; otherwise copy the
+  // source luma so non-skin areas appear as the gray-scale source.
+  // Ignore rightmost/bottom boundary blocks.
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) {
+ num_bl = 0;
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) {
+ int is_skin = 0;
+ int consec_zeromv = 0;
+ const int bl_index = mb_row * cm->mb_cols + mb_col;
+ const int bl_index1 = bl_index + 1;
+ const int bl_index2 = bl_index + cm->mb_cols;
+ const int bl_index3 = bl_index2 + 1;
+ consec_zeromv = VPXMIN(cpi->consec_zero_last[bl_index],
+ VPXMIN(cpi->consec_zero_last[bl_index1],
+ VPXMIN(cpi->consec_zero_last[bl_index2],
+ cpi->consec_zero_last[bl_index3])));
+ is_skin = compute_skin_block(src_y, src_u, src_v, src_ystride,
+ src_uvstride, consec_zeromv, 0);
+ for (i = 0; i < 16; i++) {
+ for (j = 0; j < 16; j++) {
+ if (is_skin)
+ y[i * src_ystride + j] = 255;
+ else
+ y[i * src_ystride + j] = src_y[i * src_ystride + j];
+ }
+ }
+ num_bl++;
+ y += 16;
+ src_y += 16;
+ src_u += 8;
+ src_v += 8;
+ }
+ y += (src_ystride << 4) - (num_bl << 4);
+ src_y += (src_ystride << 4) - (num_bl << 4);
+ src_u += (src_uvstride << 3) - (num_bl << 3);
+ src_v += (src_uvstride << 3) - (num_bl << 3);
+ }
+ vp8_write_yuv_frame(yuv_skinmap_file, &skinmap);
+ vpx_free_frame_buffer(&skinmap);
+}
+#endif // OUTPUT_YUV_SKINMAP
--- /dev/null
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_ENCODER_SKIN_DETECTION_H_
+#define VP8_ENCODER_SKIN_DETECTION_H_
+
+#include "vp8/encoder/onyx_int.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8_COMP;
+
+// #define OUTPUT_YUV_SKINMAP
+
+int skin_pixel(int y, int cb, int cr, int motion);
+
+int compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v,
+ int stride, int strideuv, int consec_zeromv,
+ int curr_motion_magn);
+
+#ifdef OUTPUT_YUV_SKINMAP
+// For viewing skin map on input source.
+void compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file);
+extern void vp8_write_yuv_frame(FILE *f, YV12_BUFFER_CONFIG *s);
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VP8_ENCODER_SKIN_DETECTION_H_
{ 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
};
-DECLARE_PROTECTED(DECLARE_ALIGNED(16, const short,
- vp8_bilinear_filters_x86_8[8][16])) = {
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
{ 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
{ 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
{ 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
#include "vp8/common/blockd.h"
#include "onyx_int.h"
#include "vp8/common/systemdependent.h"
+#include "vp8/common/skin_detection.h"
#include "vp8/encoder/quantize.h"
#include "vp8/common/alloccommon.h"
#include "mcomp.h"
#ifdef OUTPUT_YUV_DENOISED
FILE *yuv_denoised_file;
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+FILE *yuv_skinmap_file = NULL;
+#endif
#if 0
FILE *framepsnr;
SPEED_FEATURES *sf = &cpi->sf;
int Mode = cpi->compressor_speed;
int Speed = cpi->Speed;
+ int Speed2;
int i;
VP8_COMMON *cm = &cpi->common;
int last_improved_quant = sf->improved_quant;
cpi->mode_check_freq[THR_V_PRED] = cpi->mode_check_freq[THR_H_PRED] =
cpi->mode_check_freq[THR_B_PRED] =
speed_map(Speed, mode_check_freq_map_vhbpred);
- cpi->mode_check_freq[THR_NEW1] = speed_map(Speed, mode_check_freq_map_new1);
+
+ // For real-time mode at speed 10 keep the mode_check_freq threshold
+ // for NEW1 similar to that of speed 9.
+ Speed2 = Speed;
+ if (cpi->Speed == 10 && Mode == 2) Speed2 = RT(9);
+ cpi->mode_check_freq[THR_NEW1] = speed_map(Speed2, mode_check_freq_map_new1);
+
cpi->mode_check_freq[THR_NEW2] = cpi->mode_check_freq[THR_NEW3] =
speed_map(Speed, mode_check_freq_map_new2);
+
cpi->mode_check_freq[THR_SPLIT1] =
speed_map(Speed, mode_check_freq_map_split1);
cpi->mode_check_freq[THR_SPLIT2] = cpi->mode_check_freq[THR_SPLIT3] =
#ifdef OUTPUT_YUV_DENOISED
yuv_denoised_file = fopen("denoised.yuv", "ab");
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ yuv_skinmap_file = fopen("skinmap.yuv", "ab");
+#endif
#if 0
framepsnr = fopen("framepsnr.stt", "a");
#ifdef OUTPUT_YUV_DENOISED
fclose(yuv_denoised_file);
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ fclose(yuv_skinmap_file);
+#endif
#if 0
return 0;
}
-#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED)
+#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \
+ defined(OUTPUT_YUV_SKINMAP)
void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) {
unsigned char *src = s->y_buffer;
- int h = s->y_height;
+ int h = s->y_crop_height;
do {
fwrite(src, s->y_width, 1, yuv_file);
} while (--h);
src = s->u_buffer;
- h = s->uv_height;
+ h = s->uv_crop_height;
do {
fwrite(src, s->uv_width, 1, yuv_file);
} while (--h);
src = s->v_buffer;
- h = s->uv_height;
+ h = s->uv_crop_height;
do {
fwrite(src, s->uv_width, 1, yuv_file);
}
#endif
+#ifdef OUTPUT_YUV_SKINMAP
+ if (cpi->common.current_video_frame > 1) {
+ compute_skin_map(cpi, yuv_skinmap_file);
+ }
+#endif
+
#if CONFIG_MULTITHREAD
if (cpi->b_multi_threaded) {
/* start loopfilter in separate thread */
#include "vp8/common/reconintra4x4.h"
#include "vpx_dsp/variance.h"
#include "mcomp.h"
+#include "vp8/common/skin_detection.h"
#include "rdopt.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
extern unsigned int cnt_pm;
#endif
-#define MODEL_MODE 1
-
extern const int vp8_ref_frame_order[MAX_MODES];
extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
-// Fixed point implementation of a skin color classifier. Skin color
-// is model by a Gaussian distribution in the CbCr color space.
-// See ../../test/skin_color_detector_test.cc where the reference
-// skin color classifier is defined.
-
-// Fixed-point skin color model parameters.
-static const int skin_mean[5][2] = { { 7463, 9614 },
- { 6400, 10240 },
- { 7040, 10240 },
- { 8320, 9280 },
- { 6800, 9614 } };
-static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16
-static const int skin_threshold[6] = { 1570636, 1400000, 800000,
- 800000, 800000, 800000 }; // q18
-
-// Evaluates the Mahalanobis distance measure for the input CbCr values.
-static int evaluate_skin_color_difference(int cb, int cr, int idx) {
- const int cb_q6 = cb << 6;
- const int cr_q6 = cr << 6;
- const int cb_diff_q12 =
- (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]);
- const int cbcr_diff_q12 =
- (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]);
- const int cr_diff_q12 =
- (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]);
- const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10;
- const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10;
- const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10;
- const int skin_diff =
- skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 +
- skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2;
- return skin_diff;
-}
-
-// Checks if the input yCbCr values corresponds to skin color.
-static int is_skin_color(int y, int cb, int cr, int consec_zeromv) {
- if (y < 40 || y > 220) {
- return 0;
- } else {
- if (MODEL_MODE == 0) {
- return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]);
- } else {
- int i = 0;
- // No skin if block has been zero motion for long consecutive time.
- if (consec_zeromv > 60) return 0;
- // Exit on grey.
- if (cb == 128 && cr == 128) return 0;
- // Exit on very strong cb.
- if (cb > 150 && cr < 110) return 0;
- for (; i < 5; ++i) {
- int skin_color_diff = evaluate_skin_color_difference(cb, cr, i);
- if (skin_color_diff < skin_threshold[i + 1]) {
- if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) {
- return 0;
- } else if (consec_zeromv > 25 &&
- skin_color_diff > (skin_threshold[i + 1] >> 1)) {
- return 0;
- } else {
- return 1;
- }
- }
- // Exit if difference is much large than the threshold.
- if (skin_color_diff > (skin_threshold[i + 1] << 3)) {
- return 0;
- }
- }
- return 0;
- }
- }
-}
-
static int macroblock_corner_grad(unsigned char *signal, int stride,
int offsetx, int offsety, int sgnx,
int sgny) {
#endif
// Check if current macroblock is in skin area.
- {
- const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] +
- x->src.y_buffer[7 * x->src.y_stride + 8] +
- x->src.y_buffer[8 * x->src.y_stride + 7] +
- x->src.y_buffer[8 * x->src.y_stride + 8]) >>
- 2;
- const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] +
- x->src.u_buffer[3 * x->src.uv_stride + 4] +
- x->src.u_buffer[4 * x->src.uv_stride + 3] +
- x->src.u_buffer[4 * x->src.uv_stride + 4]) >>
- 2;
- const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] +
- x->src.v_buffer[3 * x->src.uv_stride + 4] +
- x->src.v_buffer[4 * x->src.uv_stride + 3] +
- x->src.v_buffer[4 * x->src.uv_stride + 4]) >>
- 2;
- x->is_skin = 0;
- if (!cpi->oxcf.screen_content_mode) {
- int block_index = mb_row * cpi->common.mb_cols + mb_col;
- x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]);
- }
+ x->is_skin = 0;
+ if (!cpi->oxcf.screen_content_mode) {
+ int block_index = mb_row * cpi->common.mb_cols + mb_col;
+ x->is_skin = compute_skin_block(
+ x->src.y_buffer, x->src.u_buffer, x->src.v_buffer, x->src.y_stride,
+ x->src.uv_stride, cpi->consec_zero_last[block_index], 0);
}
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity) {
VP8_CX_SRCS-yes += encoder/rdopt.c
VP8_CX_SRCS-yes += encoder/segmentation.c
VP8_CX_SRCS-yes += encoder/segmentation.h
+VP8_CX_SRCS-yes += common/skin_detection.c
+VP8_CX_SRCS-yes += common/skin_detection.h
VP8_CX_SRCS-yes += encoder/tokenize.c
VP8_CX_SRCS-yes += encoder/dct_value_cost.h
VP8_CX_SRCS-yes += encoder/dct_value_tokens.h
// if mfqe is enabled. Need to take both the quality and the speed
// into consideration.
if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) {
- vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
+ vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int);
}
if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) {
deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf,
vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q,
cm->postproc_state.limits);
} else {
- vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
+ vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf);
}
} else if (flags & VP9D_DEMACROBLOCK) {
deblock_and_de_macro_block(cm->frame_to_show, ppbuf,
} else if (flags & VP9D_DEBLOCK) {
vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits);
} else {
- vp8_yv12_copy_frame(cm->frame_to_show, ppbuf);
+ vpx_yv12_copy_frame(cm->frame_to_show, ppbuf);
}
ppstate->last_base_qindex = cm->base_qindex;
vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
"Incorrect buffer dimensions");
else
- vp8_yv12_copy_frame(cfg, sd);
+ vpx_yv12_copy_frame(cfg, sd);
} else {
vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame");
}
"Incorrect buffer dimensions");
} else {
// Overwrite the reference frame buffer.
- vp8_yv12_copy_frame(sd, ref_buf);
+ vpx_yv12_copy_frame(sd, ref_buf);
}
return cm->error.error_code;
#include "vp9/encoder/vp9_rd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
!cpi->oxcf.gf_cbr_boost_pct) {
// Force this frame as a golden update frame if this frame changes the
// resolution (resize_pending != 0).
- // TODO(marpan): check on forcing golden update if the background has very
- // high motion in current frame.
if (cpi->resize_pending != 0) {
vp9_cyclic_refresh_set_golden_update(cpi);
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
else
rc->baseline_gf_interval = 40;
if (cpi->oxcf.rc_mode == VPX_VBR) rc->baseline_gf_interval = 20;
+ if (rc->avg_frame_low_motion < 50 && rc->frames_since_key > 40)
+ rc->baseline_gf_interval = 10;
}
// Update the segmentation map, and related quantities: cyclic refresh map,
int target_refresh = 0;
double weight_segment_target = 0;
double weight_segment = 0;
+ cr->apply_cyclic_refresh = 1;
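+  // Don't apply cyclic refresh on key frames, on temporal enhancement layer
+  // frames, or (outside SVC) when avg_frame_low_motion is low well past the
+  // last key frame.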
+ if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 ||
+ (!cpi->use_svc && rc->avg_frame_low_motion < 55 &&
+ rc->frames_since_key > 40)) {
+ cr->apply_cyclic_refresh = 0;
+ return;
+ }
cr->percent_refresh = 10;
if (cr->reduce_refresh) cr->percent_refresh = 5;
cr->max_qdelta_perc = 60;
const RATE_CONTROL *const rc = &cpi->rc;
CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
struct segmentation *const seg = &cm->seg;
- // TODO(marpan): Look into whether we should reduce the amount/delta-qp
- // instead of completely shutting off at low bitrates. For now keep it on.
- // const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
- const int apply_cyclic_refresh = 1;
if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
- // Don't apply refresh on key frame or temporal enhancement layer frames.
- if (!apply_cyclic_refresh || (cm->frame_type == KEY_FRAME) ||
- (cpi->force_update_segmentation) || (cpi->svc.temporal_layer_id > 0)) {
+ if (!cr->apply_cyclic_refresh || (cpi->force_update_segmentation)) {
// Set segmentation map to 0 and disable.
unsigned char *const seg_map = cpi->segmentation_map;
memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
int qindex_delta[3];
int reduce_refresh;
double weight_segment;
+ int apply_cyclic_refresh;
};
struct VP9_COMP;
}
}
-static void copy_partitioning_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
+static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x,
+ MACROBLOCKD *xd, BLOCK_SIZE bsize,
int mi_row, int mi_col) {
VP9_COMMON *const cm = &cpi->common;
BLOCK_SIZE *prev_part = cpi->prev_partition;
const int bs = (1 << bsl) / 4;
BLOCK_SIZE subsize;
PARTITION_TYPE partition;
- MODE_INFO *mi = NULL;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;
partition = partition_lookup[bsl][prev_part[start_pos]];
subsize = get_subsize(bsize, partition);
- mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
if (subsize < BLOCK_8X8) {
- mi->sb_type = bsize;
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
} else {
switch (partition) {
- case PARTITION_NONE: mi->sb_type = bsize; break;
+ case PARTITION_NONE:
+ set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
+ break;
case PARTITION_HORZ:
- mi->sb_type = subsize;
- if (mi_row + bs < cm->mi_rows)
- cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type =
- subsize;
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row + bs, mi_col, subsize);
break;
case PARTITION_VERT:
- mi->sb_type = subsize;
- if (mi_col + bs < cm->mi_cols)
- cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type =
- subsize;
+ set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+ set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize);
break;
case PARTITION_SPLIT:
- copy_partitioning_helper(cpi, subsize, mi_row, mi_col);
- copy_partitioning_helper(cpi, subsize, mi_row + bs, mi_col);
- copy_partitioning_helper(cpi, subsize, mi_row, mi_col + bs);
- copy_partitioning_helper(cpi, subsize, mi_row + bs, mi_col + bs);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs);
+ copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs);
break;
default: assert(0);
}
}
}
-static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, int mi_row,
- int mi_col, int segment_id, int sb_offset) {
- if (cpi->rc.frames_since_key > 1 && segment_id == CR_SEGMENT_ID_BASE &&
+static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
+ int mi_row, int mi_col, int segment_id,
+ int sb_offset) {
+ int svc_copy_allowed = 1;
+ int frames_since_key_thresh = 1;
+ if (cpi->use_svc) {
+ // For SVC, don't allow copy if base spatial layer is key frame, or if
+ // frame is not a temporal enhancement layer frame.
+ int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id,
+ cpi->svc.number_temporal_layers);
+ const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ if (lc->is_key_frame ||
+ (cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1 &&
+ cpi->svc.number_temporal_layers > 1))
+ svc_copy_allowed = 0;
+ frames_since_key_thresh = cpi->svc.number_spatial_layers << 1;
+ }
+ if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed &&
+ !cpi->resize_pending && segment_id == CR_SEGMENT_ID_BASE &&
cpi->prev_segment_id[sb_offset] == CR_SEGMENT_ID_BASE &&
cpi->copied_frame_cnt[sb_offset] < cpi->max_copied_frame) {
if (cpi->prev_partition != NULL) {
- copy_partitioning_helper(cpi, BLOCK_64X64, mi_row, mi_col);
+ copy_partitioning_helper(cpi, x, xd, BLOCK_64X64, mi_row, mi_col);
cpi->copied_frame_cnt[sb_offset] += 1;
memcpy(x->variance_low, &(cpi->prev_variance_low[sb_offset * 25]),
sizeof(x->variance_low));
x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2];
// If source_sad is low copy the partition without computing the y_sad.
if (x->skip_low_source_sad && cpi->sf.copy_partition_flag &&
- copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) {
+ copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
return 0;
}
}
// Stop the copy every cpi->max_copied_frame to refresh the partition.
// TODO(jianj) : tune the threshold.
if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy &&
- copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) {
+ copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) {
chroma_check(cpi, x, bsize, y_sad, is_key_frame);
return 0;
}
YV12_BUFFER_CONFIG *sd) {
YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
if (cfg) {
- vp8_yv12_copy_frame(cfg, sd);
+ vpx_yv12_copy_frame(cfg, sd);
return 0;
} else {
return -1;
YV12_BUFFER_CONFIG *sd) {
YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag);
if (cfg) {
- vp8_yv12_copy_frame(sd, cfg);
+ vpx_yv12_copy_frame(sd, cfg);
return 0;
} else {
return -1;
// denoising we will have to modify this.
void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) {
uint8_t *src = s->y_buffer;
- int h = s->y_height;
+ int h = s->y_crop_height;
do {
fwrite(src, s->y_width, 1, f);
} while (--h);
src = s->u_buffer;
- h = s->uv_height;
+ h = s->uv_crop_height;
do {
fwrite(src, s->uv_width, 1, f);
} while (--h);
src = s->v_buffer;
- h = s->uv_height;
+ h = s->uv_crop_height;
do {
fwrite(src, s->uv_width, 1, f);
if ((cpi->use_svc &&
(cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 ||
+ cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 ||
cpi->svc.current_superframe < 1)) ||
cpi->resize_pending || cpi->resize_state || cpi->external_resize ||
cpi->resize_state != ORIG) {
// Calculate a modified Error used in distributing bits between easier and
// harder frames.
#define ACT_AREA_CORRECTION 0.5
-static double calculate_modified_err(const VP9_COMP *cpi,
- const TWO_PASS *twopass,
- const VP9EncoderConfig *oxcf,
- const FIRSTPASS_STATS *this_frame) {
+static double calculate_mod_frame_score(const VP9_COMP *cpi,
+ const TWO_PASS *twopass,
+ const VP9EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
const FIRSTPASS_STATS *const stats = &twopass->total_stats;
const double av_weight = stats->weight / stats->count;
const double av_err = (stats->coded_error * av_weight) / stats->count;
- double modified_error =
+ double modified_score =
av_err * pow(this_frame->coded_error * this_frame->weight /
DOUBLE_DIVIDE_CHECK(av_err),
oxcf->two_pass_vbrbias / 100.0);
// remaining active MBs. The correction here assumes that coding
// 0.5N blocks of complexity 2X is a little easier than coding N
// blocks of complexity X.
- modified_error *=
+ modified_score *=
pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
- return fclamp(modified_error, twopass->modified_error_min,
- twopass->modified_error_max);
+ return modified_score;
+}
+static double calculate_norm_frame_score(const VP9_COMP *cpi,
+ const TWO_PASS *twopass,
+ const VP9EncoderConfig *oxcf,
+ const FIRSTPASS_STATS *this_frame) {
+ const FIRSTPASS_STATS *const stats = &twopass->total_stats;
+ const double av_weight = stats->weight / stats->count;
+ const double av_err = (stats->coded_error * av_weight) / stats->count;
+ double modified_score =
+ av_err * pow(this_frame->coded_error * this_frame->weight /
+ DOUBLE_DIVIDE_CHECK(av_err),
+ oxcf->two_pass_vbrbias / 100.0);
+
+ const double min_score = (double)(oxcf->two_pass_vbrmin_section) / 100.0;
+ const double max_score = (double)(oxcf->two_pass_vbrmax_section) / 100.0;
+
+ // Correction for active area. Frames with a reduced active area
+ // (eg due to formatting bars) have a higher error per mb for the
+ // remaining active MBs. The correction here assumes that coding
+ // 0.5N blocks of complexity 2X is a little easier than coding N
+ // blocks of complexity X.
+ modified_score *=
+ pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
+ // Normalize to a midpoint score.
+ modified_score /= DOUBLE_DIVIDE_CHECK(twopass->mean_mod_score);
+
+ return fclamp(modified_score, min_score, max_score);
}
// This function returns the maximum target rate per frame.
fps->frame = cm->current_video_frame;
fps->spatial_layer_id = cpi->svc.spatial_layer_id;
- fps->coded_error = (double)(fp_acc_data->coded_error >> 8) + min_err;
- fps->sr_coded_error = (double)(fp_acc_data->sr_coded_error >> 8) + min_err;
- fps->intra_error = (double)(fp_acc_data->intra_error >> 8) + min_err;
+
+ fps->coded_error =
+ ((double)(fp_acc_data->coded_error >> 8) + min_err) / num_mbs;
+ fps->sr_coded_error =
+ ((double)(fp_acc_data->sr_coded_error >> 8) + min_err) / num_mbs;
+ fps->intra_error =
+ ((double)(fp_acc_data->intra_error >> 8) + min_err) / num_mbs;
+
fps->frame_noise_energy =
(double)(fp_acc_data->frame_noise_energy) / (double)num_mbs;
fps->count = 1.0;
const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
? cpi->initial_mbs
: cpi->common.MBs;
- const int active_mbs = VPXMAX(1, num_mbs - (int)(num_mbs * inactive_zone));
- const double av_err_per_mb = section_err / active_mbs;
+ const double active_pct = VPXMAX(0.01, 1.0 - inactive_zone);
+ const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct);
+ const double av_err_per_mb = section_err / active_pct;
const double speed_term = 1.0 + 0.04 * oxcf->speed;
double last_group_rate_err;
const int target_norm_bits_per_mb =
// This variable monitors how far behind the second ref update is lagging.
twopass->sr_update_lag = 1;
- // Scan the first pass file and calculate a modified total error based upon
- // the bias/power function used to allocate bits.
+ // Scan the first pass file and calculate a modified score for each
+ // frame that is used to distribute bits. The modified score is assumed
+ // to provide a linear basis for bit allocation, i.e. a frame A with a score
+ // that is double that of frame B will be allocated 2x as many bits.
{
- const double avg_error =
- stats->coded_error / DOUBLE_DIVIDE_CHECK(stats->count);
const FIRSTPASS_STATS *s = twopass->stats_in;
- double modified_error_total = 0.0;
- twopass->modified_error_min =
- (avg_error * oxcf->two_pass_vbrmin_section) / 100;
- twopass->modified_error_max =
- (avg_error * oxcf->two_pass_vbrmax_section) / 100;
+ double modified_score_total = 0.0;
+
+ // The first scan is unclamped and gives a raw average.
while (s < twopass->stats_in_end) {
- modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
+ modified_score_total += calculate_mod_frame_score(cpi, twopass, oxcf, s);
++s;
}
- twopass->modified_error_left = modified_error_total;
+
+ // The average error from this first scan is used to define the midpoint
+ // error for the rate distribution function.
+ twopass->mean_mod_score =
+ modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
+
+ // Second scan using clamps based on the previous cycle average.
+ // This may modify the total and average somewhat but we don't bother with
+ // further iterations.
+ s = twopass->stats_in;
+ modified_score_total = 0.0;
+ while (s < twopass->stats_in_end) {
+ modified_score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s);
+ ++s;
+ }
+ twopass->normalized_score_left = modified_score_total;
}
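Because the normalized scores are intended as a linear basis for bit allocation, the later distribution steps reduce to a proportional split of the remaining bits. A minimal sketch of that idea (the function name is hypothetical; the real code below additionally applies per-frame caps and group-level adjustments):

static int64_t proportional_bits(int64_t bits_left, double group_score,
                                 double score_left) {
  // Guard the denominator, as the real code does with explicit checks.
  if (score_left <= 0.0) return 0;
  return (int64_t)(bits_left * (group_score / score_left));
}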
// Reset the vbr bits off target counters
static double get_sr_decay_rate(const VP9_COMP *cpi,
const FIRSTPASS_STATS *frame) {
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
- double sr_diff = (frame->sr_coded_error - frame->coded_error) / num_mbs;
+ double sr_diff = (frame->sr_coded_error - frame->coded_error);
double sr_decay = 1.0;
double modified_pct_inter;
double modified_pcnt_intra;
(cpi->initial_height + cpi->initial_width));
modified_pct_inter = frame->pcnt_inter;
- if (((frame->coded_error / num_mbs) > LOW_CODED_ERR_PER_MB) &&
+ if ((frame->coded_error > LOW_CODED_ERR_PER_MB) &&
((frame->intra_error / DOUBLE_DIVIDE_CHECK(frame->coded_error)) <
(double)NCOUNT_FRAME_II_THRESH)) {
modified_pct_inter =
const double lq = vp9_convert_qindex_to_q(
cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5);
- int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
-
- // Correct for any inactive region in the image
- num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+ const double active_area = calculate_active_area(cpi, this_frame);
// Underlying boost factor is based on inter error ratio.
- frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
+ frame_boost = (BASELINE_ERR_PER_MB * active_area) /
DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
// Update the accumulator for second ref error difference.
// This is intended to give an indication of how much the coded error is
// increasing over time.
- *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
*sr_accumulator = VPXMAX(0.0, *sr_accumulator);
// Small adjustment for cases where there is a zoom out
const double lq = vp9_convert_qindex_to_q(
cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth);
const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00);
- int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
- : cpi->common.MBs;
-
- // Correct for any inactive region in the image
- num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame));
+ const double active_area = calculate_active_area(cpi, this_frame);
// Underlying boost factor is based on inter error ratio.
- frame_boost = (KF_BASELINE_ERR_PER_MB * num_mbs) /
+ frame_boost = (KF_BASELINE_ERR_PER_MB * active_area) /
DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator);
// Update the accumulator for second ref error difference.
// This is intended to give an indication of how much the coded error is
// increasing over time.
- *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error) / 1;
+ *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error);
*sr_accumulator = VPXMAX(0.0, *sr_accumulator);
// Small adjustment for cases where there is a zoom out
int64_t total_group_bits;
// Calculate the bits to be allocated to the group as a whole.
- if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0)) {
+ if ((twopass->kf_group_bits > 0) && (twopass->kf_group_error_left > 0.0)) {
total_group_bits = (int64_t)(twopass->kf_group_bits *
(gf_group_err / twopass->kf_group_error_left));
} else {
vp9_zero(next_frame);
// Load stats for the current frame.
- mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
// Note the error of the frame at the start of the group. This will be
// the GF frame error if we code a normal gf.
++i;
// Accumulate error score of frames in this gf group.
- mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
gf_group_err += mod_frame_err;
gf_group_raw_error += this_frame->coded_error;
gf_group_noise += this_frame->frame_noise_energy;
int j;
for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
if (EOF == input_stats(twopass, this_frame)) break;
- gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ gf_group_err +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
gf_group_raw_error += this_frame->coded_error;
gf_group_noise += this_frame->frame_noise_energy;
gf_group_skip_pct += this_frame->intra_skip_pct;
gf_group_bits);
// Adjust KF group bits and error remaining.
- twopass->kf_group_error_left -= (int64_t)gf_group_err;
+ twopass->kf_group_error_left -= gf_group_err;
// Allocate bits to each of the frames in the GF group.
allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits);
#define II_IMPROVEMENT_THRESHOLD 3.5
#define KF_II_MAX 128.0
#define II_FACTOR 12.5
+// Test for very low intra complexity which could cause false key frames
+#define V_LOW_INTRA 0.5
+
static int test_candidate_kf(TWO_PASS *twopass,
const FIRSTPASS_STATS *last_frame,
const FIRSTPASS_STATS *this_frame,
0.20) &&
(next_iiratio < 3.0)) ||
((boost_score - old_boost_score) < 3.0) ||
- (local_next_frame.intra_error < 200)) {
+ (local_next_frame.intra_error < V_LOW_INTRA)) {
break;
}
rc->frames_to_key = 1;
- twopass->kf_group_bits = 0; // Total bits available to kf group
- twopass->kf_group_error_left = 0; // Group modified error score.
+ twopass->kf_group_bits = 0; // Total bits available to kf group
+ twopass->kf_group_error_left = 0.0; // Group modified error score.
- kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
// Initialize the decay rates for the recent frames to check
for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0;
while (twopass->stats_in < twopass->stats_in_end &&
rc->frames_to_key < cpi->oxcf.key_freq) {
// Accumulate kf group error.
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
// Load the next frame's stats.
last_frame = *this_frame;
// Rescan to get the correct error data for the forced kf group.
for (i = 0; i < rc->frames_to_key; ++i) {
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
+ kf_group_err +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame);
input_stats(twopass, &tmp_frame);
}
rc->next_key_frame_forced = 1;
int j;
for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
if (EOF == input_stats(twopass, this_frame)) break;
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ kf_group_err +=
+ calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
}
rc->frames_to_key = new_frame_to_key;
}
// Special case for the last key frame of the file.
if (twopass->stats_in >= twopass->stats_in_end) {
// Accumulate kf group error.
- kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
+ kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame);
}
// Calculate the number of bits that should be assigned to the kf group.
- if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) {
+ if (twopass->bits_left > 0 && twopass->normalized_score_left > 0.0) {
// Maximum number of bits for a single normal frame (not key frame).
const int max_bits = frame_max_bits(rc, &cpi->oxcf);
// Default allocation based on bits left and relative
// complexity of the section.
twopass->kf_group_bits = (int64_t)(
- twopass->bits_left * (kf_group_err / twopass->modified_error_left));
+ twopass->bits_left * (kf_group_err / twopass->normalized_score_left));
// Clip based on maximum per frame rate defined by the user.
max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key;
gf_group->rf_level[0] = KF_STD;
// Note the total error score of the kf group minus the key frame itself.
- twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+ twopass->kf_group_error_left = (kf_group_err - kf_mod_err);
// Adjust the count of total modified error left.
// The count of bits left is adjusted elsewhere based on real coded frame
// sizes.
- twopass->modified_error_left -= kf_group_err;
+ twopass->normalized_score_left -= kf_group_err;
if (oxcf->resize_mode == RESIZE_DYNAMIC) {
// Default to normal-sized frame on keyframes.
target_rate = gf_group->bit_allocation[gf_group->index];
rc->base_frame_target = target_rate;
- {
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
- ? cpi->initial_mbs
- : cpi->common.MBs;
- // The multiplication by 256 reverses a scaling factor of (>> 8)
- // applied when combining MB error values for the frame.
- twopass->mb_av_energy =
- log(((this_frame.intra_error * 256.0) / num_mbs) + 1.0);
- twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
- }
+ // The multiplication by 256 reverses a scaling factor of (>> 8)
+ // applied when combining MB error values for the frame.
+ twopass->mb_av_energy = log((this_frame.intra_error * 256.0) + 1.0);
+ twopass->mb_smooth_pct = this_frame.intra_smooth_pct;
// Update the total stats remaining structure.
subtract_stats(&twopass->total_left_stats, &this_frame);
FIRSTPASS_STATS total_left_stats;
int first_pass_done;
int64_t bits_left;
- double modified_error_min;
- double modified_error_max;
- double modified_error_left;
+ double mean_mod_score;
+ double normalized_score_left;
double mb_av_energy;
double mb_smooth_pct;
int64_t kf_group_bits;
// Error score of frames still to be coded in kf group
- int64_t kf_group_error_left;
+ double kf_group_error_left;
double bpm_factor;
int rolling_arf_group_target_bits;
#endif // CONFIG_VP9_HIGHBITDEPTH
}
-static INLINE int divide_and_round(const int n, const int d) {
+static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) {
return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d);
}
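For reference, divide_and_round() rounds the quotient to the nearest integer with halves going away from zero; a few illustrative values (hypothetical inputs):

  divide_and_round(7, 2)  ->  4
  divide_and_round(-7, 2) -> -4
  divide_and_round(6, 4)  ->  2
  divide_and_round(-6, 4) -> -2

Widening the arguments to int64_t presumably lets the caller below form products such as (cost_list[1] - cost_list[3]) * (1 << (bits - 1)) without risking 32-bit overflow when the costs are large.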
// y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0).
// The code below is an integerized version of that.
static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) {
- *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)),
- (cost_list[1] - 2 * cost_list[0] + cost_list[3]));
- *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)),
- (cost_list[4] - 2 * cost_list[0] + cost_list[2]));
+ const int64_t x0 = (int64_t)cost_list[1] - cost_list[3];
+ const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3];
+ const int64_t x1 = (int64_t)cost_list[4] - cost_list[2];
+ const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2];
+ const int b = 1 << (bits - 1);
+ *ic = (int)divide_and_round(x0 * b, y0);
+ *ir = (int)divide_and_round(x1 * b, y1);
}
uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv,
cost_list[2] != INT_MAX && cost_list[3] != INT_MAX &&
cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) {
int ir, ic;
- unsigned int minpt;
+ unsigned int minpt = INT_MAX;
get_cost_surf_min(cost_list, &ir, &ic, 2);
if (ir != 0 || ic != 0) {
CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic);
const MvLimits tmp_mv_limits = x->mv_limits;
int rv = 0;
int cost_list[5];
+ int search_subpel = 1;
const YV12_BUFFER_CONFIG *scaled_ref_frame =
vp9_get_scaled_ref_frame(cpi, ref);
if (scaled_ref_frame) {
rv =
!(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar);
- if (rv) {
+ // For SVC, on a non-reference frame, avoid subpel search for (0, 0) motion.
+ if (cpi->use_svc && cpi->svc.non_reference_frame) {
+ if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0;
+ }
+
+ if (rv && search_subpel) {
const int subpel_force_stop = cpi->sf.mv.subpel_force_stop;
cpi->find_fractional_mv_step(
x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv,
cpi->oxcf.rc_mode == VPX_CBR) {
int tmp_sad;
uint32_t dis;
- int cost_list[5];
+ int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX };
if (bsize < BLOCK_16X16) continue;
int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame,
int active_best_quality, int active_worst_quality) {
const VP9_COMMON *const cm = &cpi->common;
+ CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
int q = active_worst_quality;
int last_error = INT_MAX;
int i, target_bits_per_mb, bits_per_mb_at_this_q;
do {
if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled &&
- cpi->svc.temporal_layer_id == 0 &&
+ cr->apply_cyclic_refresh &&
(!cpi->oxcf.gf_cbr_boost_pct || !cpi->refresh_golden_frame)) {
bits_per_mb_at_this_q =
(int)vp9_cyclic_refresh_rc_bits_per_mb(cpi, i, correction_factor);
if (cpi->svc.temporal_layer_id > 0) {
sf->adaptive_rd_thresh = 4;
sf->limit_newmv_early_exit = 0;
- sf->mv.subpel_force_stop = (cpi->svc.temporal_layer_id == 1) ? 1 : 2;
sf->base_mv_aggressive =
(cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
? 1
sf->mv.search_method = NSTEP;
sf->mv.fullpel_search_step_param = 6;
}
+ if (cpi->svc.temporal_layer_id > 0)
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
if (!cpi->external_resize) sf->use_source_sad = 1;
if (sf->use_source_sad) {
if (cpi->content_state_sb_fd == NULL &&
(cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
}
}
+ // Enable partition copy. For SVC this is enabled only for the top spatial
+ // resolution layer.
+ cpi->max_copied_frame = 0;
+ if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
+ !cpi->external_resize &&
+ (!cpi->use_svc ||
+ cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+ sf->copy_partition_flag = 1;
+ cpi->max_copied_frame = 2;
+ // Frames in the top temporal enhancement layer (when the number of temporal
+ // layers is > 1) are non-reference frames, so use a large/max value for
+ // max_copied_frame.
+ if (cpi->svc.number_temporal_layers > 1 &&
+ cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
+ cpi->max_copied_frame = 255;
+ }
}
if (speed >= 8) {
sf->adaptive_rd_thresh = 4;
- // Enable partition copy
- if (!cpi->last_frame_dropped && !cpi->use_svc && !cpi->resize_pending &&
- cpi->resize_state == ORIG && !cpi->external_resize &&
- cpi->oxcf.resize_mode == RESIZE_NONE) {
- sf->copy_partition_flag = 1;
- cpi->max_copied_frame = 4;
- }
-
+ if (!cpi->use_svc) cpi->max_copied_frame = 4;
if (cpi->row_mt && cpi->oxcf.max_threads > 1)
sf->adaptive_rd_thresh_row_mt = 1;
if (content == VP9E_CONTENT_SCREEN)
sf->mv.subpel_force_stop = 3;
- else if (cm->width > 352 && cm->height > 288)
+ else if (cm->width * cm->height > 352 * 288) {
sf->mv.subpel_force_stop = 2;
+ if (cpi->rc.avg_frame_low_motion > 87 && cm->current_video_frame > 30)
+ sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
+ }
if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
// Only keep INTRA_DC mode for speed 8.
}
// Since the short_circuit_low_temp_var is used, reduce the
// adaptive_rd_thresh level.
- if (cm->width > 352 && cm->height > 288)
+ if (cm->width * cm->height > 352 * 288)
sf->adaptive_rd_thresh = 1;
else
sf->adaptive_rd_thresh = 2;
svc->scaled_temp_is_alloc = 0;
svc->scaled_one_half = 0;
svc->current_superframe = 0;
+ svc->non_reference_frame = 0;
for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1;
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
svc->ext_frame_flags[sl] = 0;
}
}
+ cpi->svc.non_reference_frame = 0;
+ if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame &&
+ !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) {
+ cpi->svc.non_reference_frame = 1;
+ }
+
if (vp9_set_size_literal(cpi, width, height) != 0)
return VPX_CODEC_INVALID_PARAM;
int ref_frame_index[REF_FRAMES];
int force_zero_mode_spatial_ref;
int current_superframe;
+ int non_reference_frame;
int use_base_mv;
// Used to control the downscaling filter for source scaling, for 1 pass CBR.
// downsample_filter_phase: = 0 will do sub-sampling (no weighted average),
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
const uint32x4_t a = vpaddlq_u16(v_16x8);
--- /dev/null
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ if (width > 8) {
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; x += 16) {
+ const uint8x16_t p = vld1q_u8(pred + x);
+ const uint8x16_t r = vld1q_u8(ref + x);
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+ vst1q_u8(comp + x, avg);
+ }
+ comp += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else {
+ int i;
+ for (i = 0; i < width * height; i += 16) {
+ const uint8x16_t p = vld1q_u8(pred);
+ uint8x16_t r;
+
+ if (width == 4) {
+ r = load_unaligned_u8q(ref, ref_stride);
+ ref += 4 * ref_stride;
+ } else {
+ const uint8x8_t r_0 = vld1_u8(ref);
+ const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+ assert(width == 8);
+ r = vcombine_u8(r_0, r_1);
+ ref += 2 * ref_stride;
+ }
+ r = vrhaddq_u8(r, p);
+ vst1q_u8(comp, r);
+
+ pred += 16;
+ comp += 16;
+ }
+ }
+}
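vrhaddq_u8 is a rounding halving add, so every output lane is (pred + ref + 1) >> 1. A scalar model of the same operation, shown only for clarity (not part of the patch):

static void comp_avg_pred_scalar(uint8_t *comp, const uint8_t *pred, int width,
                                 int height, const uint8_t *ref,
                                 int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    // Rounded average of the two predictors, one pixel at a time.
    for (j = 0; j < width; ++j)
      comp[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    comp += width;
    pred += width;
    ref += ref_stride;
  }
}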
--- /dev/null
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+
+void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ int i;
+ // input[M * stride] * 16
+ int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+ input_0 = vadd_s16(input_0, one);
+ }
+
+ for (i = 0; i < 2; ++i) {
+ const int16x8_t input_01 = vcombine_s16(input_0, input_1);
+ const int16x8_t input_32 = vcombine_s16(input_3, input_2);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // (s_0 +/- s_1) * cospi_16_64
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+ const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64);
+
+ // fdct_round_shift
+ int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+ int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64);
+
+ const int32x4_t temp3 =
+ vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64);
+ const int32x4_t temp4 =
+ vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64);
+
+ // fdct_round_shift
+ int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+ int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+ transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+ input_0 = out_0;
+ input_1 = out_1;
+ input_2 = out_2;
+ input_3 = out_3;
+ }
+
+ {
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(input_0, input_1);
+ int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+ store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+ }
+}
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
int h, int bd) {
const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
- DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
+ uint16_t temp[64 * 136];
const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
int w, int h, int bd) {
const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
// + 1 to make it divisible by 4
- DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
+ uint16_t temp[64 * 136];
const int intermediate_height =
(((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
#include <assert.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/inv_txfm.h"
static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride,
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
const uint8_t *dst = dest;
const int16x4_t cospis = vld1_s16(kCospi);
- uint32x2_t dest01_u32 = vdup_n_u32(0);
+ uint8x8_t dest01_u8;
uint32x2_t dest32_u32 = vdup_n_u32(0);
int16x8_t a0, a1;
uint8x8_t d01, d32;
a0 = vrshrq_n_s16(a0, 4);
a1 = vrshrq_n_s16(a1, 4);
- dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0);
- dst += stride;
- dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1);
- dst += stride;
+ dest01_u8 = load_u8(dst, stride);
+ dst += 2 * stride;
+ // The elements are loaded in reverse order.
dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1);
dst += stride;
dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0);
- d01_u16 =
- vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32));
+ d01_u16 = vaddw_u8(vreinterpretq_u16_s16(a0), dest01_u8);
d32_u16 =
vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32));
d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16));
d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16));
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0);
- dest += stride;
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1);
- dest += stride;
+ store_u8(dest, stride, d01);
+ dest += 2 * stride;
+ // The elements are stored in reverse order.
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1);
dest += stride;
vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0);
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"
-DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
+static const int16_t kCospi[16] = {
16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};
-DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = {
+static const int32_t kCospi32[16] = {
16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};
-//------------------------------------------------------------------------------
-// Helper functions used to load tran_low_t into int16, narrowing if necessary.
-
-static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int32x4x2_t v0 = vld2q_s32(buf);
- const int32x4x2_t v1 = vld2q_s32(buf + 8);
- const int16x4_t s0 = vmovn_s32(v0.val[0]);
- const int16x4_t s1 = vmovn_s32(v0.val[1]);
- const int16x4_t s2 = vmovn_s32(v1.val[0]);
- const int16x4_t s3 = vmovn_s32(v1.val[1]);
- int16x8x2_t res;
- res.val[0] = vcombine_s16(s0, s2);
- res.val[1] = vcombine_s16(s1, s3);
- return res;
-#else
- return vld2q_s16(buf);
-#endif
-}
-
-static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int32x4_t v0 = vld1q_s32(buf);
- const int32x4_t v1 = vld1q_s32(buf + 4);
- const int16x4_t s0 = vmovn_s32(v0);
- const int16x4_t s1 = vmovn_s32(v1);
- return vcombine_s16(s0, s1);
-#else
- return vld1q_s16(buf);
-#endif
-}
-
-static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int32x4_t v0 = vld1q_s32(buf);
- return vmovn_s32(v0);
-#else
- return vld1_s16(buf);
-#endif
-}
-
-static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
-#if CONFIG_VP9_HIGHBITDEPTH
- const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
- const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
- vst1q_s32(buf, v0);
- vst1q_s32(buf + 4, v1);
-#else
- vst1q_s16(buf, a);
-#endif
-}
-
//------------------------------------------------------------------------------
// Use saturating add/sub to avoid overflow in 2nd pass in high bit-depth
static INLINE int16x8_t final_add(const int16x8_t a, const int16x8_t b) {
--- /dev/null
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_ARM_MEM_NEON_H_
+#define VPX_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Helper functions used to load tran_low_t into int16, narrowing if necessary.
+static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4x2_t v0 = vld2q_s32(buf);
+ const int32x4x2_t v1 = vld2q_s32(buf + 8);
+ const int16x4_t s0 = vmovn_s32(v0.val[0]);
+ const int16x4_t s1 = vmovn_s32(v0.val[1]);
+ const int16x4_t s2 = vmovn_s32(v1.val[0]);
+ const int16x4_t s3 = vmovn_s32(v1.val[1]);
+ int16x8x2_t res;
+ res.val[0] = vcombine_s16(s0, s2);
+ res.val[1] = vcombine_s16(s1, s3);
+ return res;
+#else
+ return vld2q_s16(buf);
+#endif
+}
+
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ const int32x4_t v1 = vld1q_s32(buf + 4);
+ const int16x4_t s0 = vmovn_s32(v0);
+ const int16x4_t s1 = vmovn_s32(v1);
+ return vcombine_s16(s0, s1);
+#else
+ return vld1q_s16(buf);
+#endif
+}
+
+static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vld1q_s32(buf);
+ return vmovn_s32(v0);
+#else
+ return vld1_s16(buf);
+#endif
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+ const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+ vst1q_s32(buf, v0);
+ vst1q_s32(buf + 4, v1);
+#else
+ vst1q_s16(buf, a);
+#endif
+}
+
+// Propagate type information to the compiler. Without this the compiler may
+// assume the required alignment of uint32_t (4 bytes) and add alignment hints
+// to the memory access.
+//
+// This is used for functions operating on uint8_t which wish to load or store 4
+// values at a time but which may not be on 4 byte boundaries.
+static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
+ memcpy(buf, &a, 4);
+}
+
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x2_t a_u32 = vdup_n_u32(0);
+ if (stride == 4) return vld1_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vld1_lane_u32(&a, a_u32, 0);
+ memcpy(&a, buf, 4);
+ a_u32 = vld1_lane_u32(&a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+// Store 2 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8(uint8_t *buf, int stride,
+ const uint8x8_t a) {
+ const uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+ if (stride == 4) {
+ vst1_u8(buf, a);
+ return;
+ }
+ uint32_to_mem(buf, vget_lane_u32(a_u32, 0));
+ buf += stride;
+ uint32_to_mem(buf, vget_lane_u32(a_u32, 1));
+}
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x4_t a_u32 = vdupq_n_u32(0);
+ if (stride == 4) return vld1q_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vld1q_lane_u32(&a, a_u32, 0);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vld1q_lane_u32(&a, a_u32, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vld1q_lane_u32(&a, a_u32, 2);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vld1q_lane_u32(&a, a_u32, 3);
+ return vreinterpretq_u8_u32(a_u32);
+}
+
+// Store 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE void store_unaligned_u8q(uint8_t *buf, int stride,
+ const uint8x16_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_u8(a);
+ if (stride == 4) {
+ vst1q_u8(buf, a);
+ return;
+ }
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 0));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 1));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 2));
+ buf += stride;
+ uint32_to_mem(buf, vgetq_lane_u32(a_u32, 3));
+}
+
+// Load 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE uint8x8_t load_u8(const uint8_t *buf, int stride) {
+ uint32x2_t a = vdup_n_u32(0);
+
+ assert(!((intptr_t)buf % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ a = vld1_lane_u32((const uint32_t *)buf, a, 0);
+ buf += stride;
+ a = vld1_lane_u32((const uint32_t *)buf, a, 1);
+ return vreinterpret_u8_u32(a);
+}
+
+// Store 2 sets of 4 bytes when alignment is guaranteed.
+static INLINE void store_u8(uint8_t *buf, int stride, const uint8x8_t a) {
+ uint32x2_t a_u32 = vreinterpret_u32_u8(a);
+
+ assert(!((intptr_t)buf % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
+
+ vst1_lane_u32((uint32_t *)buf, a_u32, 0);
+ buf += stride;
+ vst1_lane_u32((uint32_t *)buf, a_u32, 1);
+}
+#endif // VPX_DSP_ARM_MEM_NEON_H_
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
-#include "vpx_ports/mem.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/variance.h"
+#include "vpx_dsp/arm/mem_neon.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
+// Process a block exactly 4 wide and a multiple of 2 high.
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ const uint8_t *filter) {
+ const uint8x8_t f0 = vdup_n_u8(filter[0]);
+ const uint8x8_t f1 = vdup_n_u8(filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; i += 2) {
+ const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
+ const uint8x8_t src_1 =
+ load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
+ const uint16x8_t a = vmull_u8(src_0, f0);
+ const uint16x8_t b = vmlal_u8(a, src_1, f1);
+ const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
+ vst1_u8(output_ptr, out);
+ src_ptr += 2 * src_pixels_per_line;
+ output_ptr += 8;
+ }
+}
+
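Each filter pair in the table above sums to 128, so one bilinear tap is a weighted average with 7 fractional bits, matching the vrshrn_n_u16(..., FILTER_BITS) rounding narrow. A scalar model of a single output pixel, shown only for clarity:

static uint8_t bilinear_tap_scalar(uint8_t s0, uint8_t s1,
                                   const uint8_t *filter) {
  // out = round((s0 * filter[0] + s1 * filter[1]) / 128)
  return (uint8_t)((s0 * filter[0] + s1 * filter[1] + 64) >> 7);
}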
// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
uint8_t *output_ptr,
int pixel_step,
unsigned int output_height,
const uint8_t *filter) {
- const uint8x8_t f0 = vmov_n_u8(filter[0]);
- const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ const uint8x8_t f0 = vdup_n_u8(filter[0]);
+ const uint8x8_t f1 = vdup_n_u8(filter[1]);
unsigned int i;
for (i = 0; i < output_height; ++i) {
const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
const uint16x8_t a = vmull_u8(src_0, f0);
const uint16x8_t b = vmlal_u8(a, src_1, f1);
const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(&output_ptr[0], out);
- // Next row...
+ vst1_u8(output_ptr, out);
src_ptr += src_pixels_per_line;
output_ptr += 8;
}
unsigned int output_height,
unsigned int output_width,
const uint8_t *filter) {
- const uint8x8_t f0 = vmov_n_u8(filter[0]);
- const uint8x8_t f1 = vmov_n_u8(filter[1]);
+ const uint8x8_t f0 = vdup_n_u8(filter[0]);
+ const uint8x8_t f1 = vdup_n_u8(filter[1]);
unsigned int i, j;
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 16) {
const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi));
+ vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
}
- // Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
}
-// TODO(johannkoenig): support 4xM block sizes.
-#define sub_pixel_varianceNxM(n, m) \
- unsigned int vpx_sub_pixel_variance##n##x##m##_neon( \
- const uint8_t *src, int src_stride, int xoffset, int yoffset, \
- const uint8_t *dst, int dst_stride, unsigned int *sse) { \
- DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]); \
- DECLARE_ALIGNED(16, uint8_t, temp2[n * m]); \
- \
- if (n == 8) { \
- var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1), \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_w8(fdata3, temp2, n, n, m, \
- bilinear_filters[yoffset]); \
- } else { \
- var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \
- bilinear_filters[xoffset]); \
- var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n, \
- bilinear_filters[yoffset]); \
- } \
- return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse); \
+// The 4xM filter writes an extra row to its intermediate buffer (temp0)
+// because it processes two rows at a time.
+#define sub_pixel_varianceNxM(n, m) \
+ uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
+ uint8_t temp1[n * m]; \
+ \
+ if (n == 4) { \
+ var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else if (n == 8) { \
+ var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else { \
+ var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
+ bilinear_filters[yoffset]); \
+ } \
+ return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \
}
+sub_pixel_varianceNxM(4, 4);
+sub_pixel_varianceNxM(4, 8);
sub_pixel_varianceNxM(8, 4);
sub_pixel_varianceNxM(8, 8);
sub_pixel_varianceNxM(8, 16);
sub_pixel_varianceNxM(32, 64);
sub_pixel_varianceNxM(64, 32);
sub_pixel_varianceNxM(64, 64);
+
+// The 4xM filter writes an extra row to its intermediate buffer (temp0)
+// because it processes two rows at a time.
+#define sub_pixel_avg_varianceNxM(n, m) \
+ uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
+ uint8_t temp1[n * m]; \
+ \
+ if (n == 4) { \
+ var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else if (n == 8) { \
+ var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
+ bilinear_filters[yoffset]); \
+ } else { \
+ var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \
+ bilinear_filters[xoffset]); \
+ var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
+ bilinear_filters[yoffset]); \
+ } \
+ \
+ vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \
+ \
+ return vpx_variance##n##x##m(temp0, n, b, b_stride, sse); \
+ }
+
+sub_pixel_avg_varianceNxM(4, 4);
+sub_pixel_avg_varianceNxM(4, 8);
+sub_pixel_avg_varianceNxM(8, 4);
+sub_pixel_avg_varianceNxM(8, 8);
+sub_pixel_avg_varianceNxM(8, 16);
+sub_pixel_avg_varianceNxM(16, 8);
+sub_pixel_avg_varianceNxM(16, 16);
+sub_pixel_avg_varianceNxM(16, 32);
+sub_pixel_avg_varianceNxM(32, 16);
+sub_pixel_avg_varianceNxM(32, 32);
+sub_pixel_avg_varianceNxM(32, 64);
+sub_pixel_avg_varianceNxM(64, 32);
+sub_pixel_avg_varianceNxM(64, 64);
*/
#include <arm_neon.h>
+#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_ports/mem.h"
static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
return vget_lane_s32(c, 0);
}
-// w * h must be less than 2048 or sum_s16 may overflow.
+// The variance helper functions use int16_t for sum. 8 values are accumulated
+// and then added (at which point they expand up to int32_t). To avoid overflow,
+// there can be no more than 32767 / 255 ~= 128 values accumulated in each
+// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
+// rows = 128. Asserts have been added to each function to warn against reaching
+// this limit.
+
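Concretely, with the 128-terms-per-lane limit above: the 4-wide path spreads 4 pixels per row across 8 lanes, so each lane accumulates h / 2 terms and h may be as large as 256; the 8-wide path adds one term per lane per row, giving h <= 128; and the 16-wide path adds w / 8 terms per lane per row, giving (w / 8) * h <= 128. These are exactly the bounds asserted in each helper.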
+// Process a block of width 4 four rows at a time.
+static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int h, uint32_t *sse, int *sum) {
+ int i;
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_lo_s32 = vdupq_n_s32(0);
+ int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+
+ // Since width is only 4, sum_s16 only loads a half row per loop.
+ assert(h <= 256);
+
+ for (i = 0; i < h; i += 4) {
+ const uint8x16_t a_u8 = load_unaligned_u8q(a, a_stride);
+ const uint8x16_t b_u8 = load_unaligned_u8q(b, b_stride);
+ const uint16x8_t diff_lo_u16 =
+ vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
+ const uint16x8_t diff_hi_u16 =
+ vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
+
+ const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
+ const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
+
+ sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
+ sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+
+ sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
+ vget_low_s16(diff_lo_s16));
+ sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
+ vget_high_s16(diff_lo_s16));
+
+ sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
+ vget_low_s16(diff_hi_s16));
+ sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
+ vget_high_s16(diff_hi_s16));
+
+ a += 4 * a_stride;
+ b += 4 * b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
+}
+
// Process a block of any size where the width is divisible by 16.
static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h, uint32_t *sse,
int32x4_t sse_lo_s32 = vdupq_n_s32(0);
int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+ // The loop loads 16 values at a time but doubles them up when accumulating
+ // into sum_s16.
+ assert(w / 8 * h <= 128);
+
for (i = 0; i < h; ++i) {
for (j = 0; j < w; j += 16) {
const uint8x16_t a_u8 = vld1q_u8(a + j);
*sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32));
}
-// w * h must be less than 2048 or sum_s16 may overflow.
// Process a block of width 8 two rows at a time.
static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int h, uint32_t *sse, int *sum) {
int32x4_t sse_lo_s32 = vdupq_n_s32(0);
int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+ // Each column has its own accumulator entry in sum_s16.
+ assert(h <= 128);
+
do {
const uint8x8_t a_0_u8 = vld1_u8(a);
const uint8x8_t a_1_u8 = vld1_u8(a + a_stride);
const uint8_t *b, int b_stride, \
unsigned int *sse) { \
int sum; \
- if (n == 8) \
+ if (n == 4) \
+ variance_neon_w4x4(a, a_stride, b, b_stride, m, sse, &sum); \
+ else if (n == 8) \
variance_neon_w8x2(a, a_stride, b, b_stride, m, sse, &sum); \
else \
variance_neon_w16(a, a_stride, b, b_stride, n, m, sse, &sum); \
return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
}
+varianceNxM(4, 4, 4);
+varianceNxM(4, 8, 5);
varianceNxM(8, 4, 5);
varianceNxM(8, 8, 6);
varianceNxM(8, 16, 7);
/* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
* maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
*/
- DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+ uint8_t temp[64 * 72];
// Account for the vertical phase needing 3 lines prior and 4 lines post
const int intermediate_height = h + 7;
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4, int w,
int h) {
- DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
+ uint8_t temp[64 * 72];
const int intermediate_height = h + 7;
assert(y_step_q4 == 16);
*/
#include <assert.h>
#include <stdlib.h>
-#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
-DECLARE_PROTECTED(const int16_t vpx_rv[]) = {
+const int16_t vpx_rv[] = {
8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <assert.h>
-
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
int i, j;
- /* comp_pred and pred must be 16 byte aligned. */
- assert(((intptr_t)comp_pred & 0xf) == 0);
- assert(((intptr_t)pred & 0xf) == 0);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
endif
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
+DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
DSP_SRCS-yes += variance.c
DSP_SRCS-yes += variance.h
+DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
# Neon utilities
+DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h
DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h
# PPC VSX utilities
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct4x4 sse2/;
+ specialize qw/vpx_fdct4x4 neon sse2/;
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2/;
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
} else {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct4x4 sse2 msa/;
+ specialize qw/vpx_fdct4x4 neon sse2 msa/;
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2/;
add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- specialize qw/vpx_highbd_idct4x4_1_add neon/;
+ specialize qw/vpx_highbd_idct4x4_1_add neon sse2/;
add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- specialize qw/vpx_highbd_idct8x8_1_add neon/;
+ specialize qw/vpx_highbd_idct8x8_1_add neon sse2/;
add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
- specialize qw/vpx_highbd_idct16x16_1_add neon/;
+ specialize qw/vpx_highbd_idct16x16_1_add neon sse2/;
add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd";
specialize qw/vpx_variance8x4 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-# TODO(johannkoenig): neon
- specialize qw/vpx_variance4x8 sse2 msa/;
+ specialize qw/vpx_variance4x8 sse2 neon msa/;
add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-# TODO(johannkoenig): neon
- specialize qw/vpx_variance4x4 sse2 msa/;
+ specialize qw/vpx_variance4x4 sse2 neon msa/;
#
# Specialty Variance
specialize qw/vpx_get4x4sse_cs neon msa vsx/;
add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
- specialize qw/vpx_comp_avg_pred sse2 vsx/;
+ specialize qw/vpx_comp_avg_pred neon sse2 vsx/;
#
# Subpixel Variance
specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance4x8 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_sub_pixel_variance4x4 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_variance4x4 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance64x32 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x64 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance32x16 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x32 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x16 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance16x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x16 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance8x4 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x8 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa sse2 ssse3/;
add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/;
+ specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa sse2 ssse3/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
}
}
}
+
+void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 16);
+}
*/
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- __m128i dc_value, d;
- const __m128i zero = _mm_setzero_si128();
- const __m128i one = _mm_set1_epi16(1);
- const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
- int a, i, j;
- tran_low_t out;
-
- out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
- out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
- a = ROUND_POWER_OF_TWO(out, 6);
-
- d = _mm_set1_epi32(a);
- dc_value = _mm_packs_epi32(d, d);
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 4; ++j) {
- d = _mm_loadu_si128((const __m128i *)(&dest[j * 8]));
- d = _mm_adds_epi16(d, dc_value);
- d = _mm_max_epi16(d, zero);
- d = _mm_min_epi16(d, max);
- _mm_storeu_si128((__m128i *)(&dest[j * 8]), d);
- }
- dest += stride;
- }
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 32);
}
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
+static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0,
+ const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3
+ const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3
+ return dct_const_round_shift_sse2(t2);
+}
+
+static INLINE __m128i wraplow_16bit_sse2(const __m128i in0, const __m128i in1,
+ const __m128i rounding) {
+ __m128i temp[2];
+ temp[0] = _mm_add_epi32(in0, rounding);
+ temp[1] = _mm_add_epi32(in1, rounding);
+ temp[0] = _mm_srai_epi32(temp[0], 4);
+ temp[1] = _mm_srai_epi32(temp[1], 4);
+ return _mm_packs_epi32(temp[0], temp[1]);
+}
+
+static INLINE void highbd_idct4_small_sse2(__m128i *const io) {
+ const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0);
+ const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0);
+ const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0);
+ __m128i temp1[4], temp2[4], step[4];
+
+ transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+
+ // Note: SSE2 has no 32-bit signed multiply instruction, so
+ // _mm_mul_epu32() is used instead. It only guarantees that the lower
+ // 32 bits of each (signed) product are meaningful, which is enough in
+ // this function.
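+ // (Illustrative note: for any two int32_t values a and b, the low 32 bits
+ // of the unsigned product equal the low 32 bits of the signed product,
+ // because multiplication is congruent modulo 2^32; only those low 32 bits
+ // are consumed by dct_const_round_shift_4_sse2().)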
+
+ // stage 1
+ temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3
+ temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3
+ temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64
+ temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64
+ temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64
+ temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64
+ step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+ step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+ temp1[3] = _mm_srli_si128(io[1], 4);
+ temp2[3] = _mm_srli_si128(io[3], 4);
+ temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64
+ temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64
+ temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64
+ temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64
+ temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64
+ temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64
+ temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64
+ temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64
+ temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8
+ temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8
+ temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24
+ temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24
+ step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]);
+ step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
+static INLINE void abs_extend_64bit_sse2(const __m128i in,
+ __m128i *const out /*out[2]*/,
+ __m128i *const sign /*sign[2]*/) {
+ sign[0] = _mm_srai_epi32(in, 31);
+ out[0] = _mm_xor_si128(in, sign[0]);
+ out[0] = _mm_sub_epi32(out[0], sign[0]);
+ sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3
+ sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1
+ out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3
+ out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1
+}
+
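+// Together with abs_extend_64bit_sse2() above, this implements a signed
+// 32x32 -> 64-bit multiply on top of the unsigned _mm_mul_epu32():
+// x * c == sign(x) * (|x| * c) for a non-negative constant c, so the product
+// is taken on |x| and the sign is re-applied (xor and subtract) afterwards.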
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in,
+ const __m128i sign,
+ const __m128i cospi) {
+ __m128i out = _mm_mul_epu32(in, cospi);
+ out = _mm_xor_si128(out, sign);
+ return _mm_sub_epi64(out, sign);
+}
+
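+// The "large" path pre-scales its cospi constants by 4 (<< 2), so the usual
+// (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS rounding becomes a plain right
+// shift by 16 bits of each 64-bit product; _mm_srli_si128() performs that as
+// a 2-byte shift, and the bits that cross the 64-bit lane boundary land in
+// the upper words, which pack_4_sse2() discards.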
+static INLINE __m128i dct_const_round_shift_64bit_sse2(const __m128i in) {
+ const __m128i t = _mm_add_epi64(
+ in,
+ _mm_setr_epi32(DCT_CONST_ROUNDING << 2, 0, DCT_CONST_ROUNDING << 2, 0));
+ return _mm_srli_si128(t, 2);
+}
+
+static INLINE __m128i pack_4_sse2(const __m128i in0, const __m128i in1) {
+ const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2
+ const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3
+ return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3
+}
+
+static INLINE void highbd_idct4_large_sse2(__m128i *const io) {
+ const __m128i cospi_p16_p16 =
+ _mm_setr_epi32(cospi_16_64 << 2, 0, cospi_16_64 << 2, 0);
+ const __m128i cospi_p08_p08 =
+ _mm_setr_epi32(cospi_8_64 << 2, 0, cospi_8_64 << 2, 0);
+ const __m128i cospi_p24_p24 =
+ _mm_setr_epi32(cospi_24_64 << 2, 0, cospi_24_64 << 2, 0);
+ __m128i temp1[4], temp2[4], step[4], sign1[4], sign2[4];
+
+ transpose_32bit_4x4(&io[0], &io[1], &io[2], &io[3]);
+
+ // stage 1
+ temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2]
+ temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2]
+ abs_extend_64bit_sse2(temp1[0], temp1, sign1);
+ abs_extend_64bit_sse2(temp2[0], temp2, sign2);
+ temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p16_p16);
+ temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p16_p16);
+ temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p16_p16);
+ temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p16_p16);
+ temp1[0] = dct_const_round_shift_64bit_sse2(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit_sse2(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit_sse2(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit_sse2(temp2[1]);
+ step[0] = pack_4_sse2(temp1[0], temp1[1]);
+ step[1] = pack_4_sse2(temp2[0], temp2[1]);
+
+ abs_extend_64bit_sse2(io[1], temp1, sign1);
+ abs_extend_64bit_sse2(io[3], temp2, sign2);
+ temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p08_p08);
+ temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p08_p08);
+ temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], cospi_p24_p24);
+ temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], cospi_p24_p24);
+ temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p24_p24);
+ temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p24_p24);
+ temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], cospi_p08_p08);
+ temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], cospi_p08_p08);
+ temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); // [1]*cospi_24 - [3]*cospi_8
+ temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); // [1]*cospi_24 - [3]*cospi_8
+ temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); // [1]*cospi_8 + [3]*cospi_24
+ temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); // [1]*cospi_8 + [3]*cospi_24
+ temp1[0] = dct_const_round_shift_64bit_sse2(temp1[0]);
+ temp1[1] = dct_const_round_shift_64bit_sse2(temp1[1]);
+ temp2[0] = dct_const_round_shift_64bit_sse2(temp2[0]);
+ temp2[1] = dct_const_round_shift_64bit_sse2(temp2[1]);
+ step[2] = pack_4_sse2(temp1[0], temp1[1]);
+ step[3] = pack_4_sse2(temp2[0], temp2[1]);
+
+ // stage 2
+ io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3]
+ io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2]
+ io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2]
+ io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3]
+}
+
void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest,
int stride, int bd) {
- tran_low_t out[4 * 4];
- tran_low_t *outptr = out;
- int i, j;
- __m128i inptr[4];
- __m128i sign_bits[2];
- __m128i temp_mm, min_input, max_input;
- int test;
- int optimised_cols = 0;
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i eight = _mm_set1_epi16(8);
- const __m128i max = _mm_set1_epi16(12043);
- const __m128i min = _mm_set1_epi16(-12043);
- // Load input into __m128i
- inptr[0] = _mm_loadu_si128((const __m128i *)input);
- inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4));
- inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8));
- inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12));
-
- // Pack to 16 bits
- inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]);
- inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]);
-
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (!test) {
- // Do the row transform
- idct4_sse2(inptr);
-
- // Check the min & max values
- max_input = _mm_max_epi16(inptr[0], inptr[1]);
- min_input = _mm_min_epi16(inptr[0], inptr[1]);
- max_input = _mm_cmpgt_epi16(max_input, max);
- min_input = _mm_cmplt_epi16(min_input, min);
- temp_mm = _mm_or_si128(max_input, min_input);
- test = _mm_movemask_epi8(temp_mm);
-
- if (test) {
- transpose_4x4(inptr);
- sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero);
- sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero);
- inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]);
- inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]);
- inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]);
- inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]);
- _mm_storeu_si128((__m128i *)outptr, inptr[0]);
- _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]);
- _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]);
- _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]);
- } else {
- // Set to use the optimised transform for the column
- optimised_cols = 1;
- }
- } else {
- // Run the un-optimised row transform
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct4_c(input, outptr, bd);
- input += 4;
- outptr += 4;
- }
+ int16_t max = 0, min = 0;
+ __m128i io[4], io_short[2];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0));
+ io[1] = _mm_load_si128((const __m128i *)(input + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 8));
+ io[3] = _mm_load_si128((const __m128i *)(input + 12));
+
+ io_short[0] = _mm_packs_epi32(io[0], io[1]);
+ io_short[1] = _mm_packs_epi32(io[2], io[3]);
+
+ if (bd != 8) {
+ __m128i max_input, min_input;
+
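+ // Find the largest and smallest coefficients by folding the eight
+ // 16-bit lanes down to a single lane.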
+ max_input = _mm_max_epi16(io_short[0], io_short[1]);
+ min_input = _mm_min_epi16(io_short[0], io_short[1]);
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8));
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4));
+ max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2));
+ min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2));
+ max = _mm_extract_epi16(max_input, 0);
+ min = _mm_extract_epi16(min_input, 0);
}
- if (optimised_cols) {
- idct4_sse2(inptr);
-
- // Final round and shift
- inptr[0] = _mm_add_epi16(inptr[0], eight);
- inptr[1] = _mm_add_epi16(inptr[1], eight);
-
- inptr[0] = _mm_srai_epi16(inptr[0], 4);
- inptr[1] = _mm_srai_epi16(inptr[1], 4);
-
- // Reconstruction and Store
- {
- __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
- __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
- d0 = _mm_unpacklo_epi64(
- d0, _mm_loadl_epi64((const __m128i *)(dest + stride)));
- d2 = _mm_unpacklo_epi64(
- d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
- d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd);
- d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd);
- // store input0
- _mm_storel_epi64((__m128i *)dest, d0);
- // store input1
- d0 = _mm_srli_si128(d0, 8);
- _mm_storel_epi64((__m128i *)(dest + stride), d0);
- // store input2
- _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
- // store input3
- d2 = _mm_srli_si128(d2, 8);
- _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
- }
+ if (bd == 8 || (max < 4096 && min >= -4096)) {
+ idct4_sse2(io_short);
+ idct4_sse2(io_short);
+ io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8));
+ io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8));
+ io[0] = _mm_srai_epi16(io_short[0], 4);
+ io[1] = _mm_srai_epi16(io_short[1], 4);
} else {
- // Run the un-optimised column transform
- tran_low_t temp_in[4], temp_out[4];
- // Columns
- for (i = 0; i < 4; ++i) {
- for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
- vpx_highbd_idct4_c(temp_in, temp_out, bd);
- for (j = 0; j < 4; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
- }
+ if (max < 32767 && min > -32768) {
+ highbd_idct4_small_sse2(io);
+ highbd_idct4_small_sse2(io);
+ } else {
+ highbd_idct4_large_sse2(io);
+ highbd_idct4_large_sse2(io);
}
+ io[0] = wraplow_16bit_sse2(io[0], io[1], _mm_set1_epi32(8));
+ io[1] = wraplow_16bit_sse2(io[2], io[3], _mm_set1_epi32(8));
+ }
+
+ // Reconstruction and Store
+ {
+ __m128i d0 = _mm_loadl_epi64((const __m128i *)dest);
+ __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2));
+ d0 = _mm_unpacklo_epi64(d0,
+ _mm_loadl_epi64((const __m128i *)(dest + stride)));
+ d2 = _mm_unpacklo_epi64(
+ d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3)));
+ d0 = clamp_high_sse2(_mm_adds_epi16(d0, io[0]), bd);
+ d2 = clamp_high_sse2(_mm_adds_epi16(d2, io[1]), bd);
+ // store input0
+ _mm_storel_epi64((__m128i *)dest, d0);
+ // store input1
+ d0 = _mm_srli_si128(d0, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride), d0);
+ // store input2
+ _mm_storel_epi64((__m128i *)(dest + stride * 2), d2);
+ // store input3
+ d2 = _mm_srli_si128(d2, 8);
+ _mm_storel_epi64((__m128i *)(dest + stride * 3), d2);
+ }
+}
+
+void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ // Faster than _mm_set1_epi16((1 << bd) - 1).
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ int a1, i;
+ tran_low_t out;
+ __m128i dc, d;
+
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+ dc = _mm_set1_epi16(a1);
+
+ for (i = 0; i < 4; ++i) {
+ d = _mm_loadl_epi64((const __m128i *)dest);
+ d = add_dc_clamp(&zero, &max, &dc, &d);
+ _mm_storel_epi64((__m128i *)dest, d);
+ dest += stride;
}
}
}
}
}
+
+void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest,
+ int stride, int bd) {
+ highbd_idct_1_add_kernel(input, dest, stride, bd, 8);
+}
#include "vpx_dsp/inv_txfm.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
+static INLINE __m128i add_dc_clamp(const __m128i *const min,
+ const __m128i *const max,
+ const __m128i *const dc,
+ const __m128i *const in) {
+ __m128i out;
+ out = _mm_adds_epi16(*in, *dc);
+ out = _mm_max_epi16(out, *min);
+ out = _mm_min_epi16(out, *max);
+ return out;
+}
+
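+// Reconstruct a DC-only inverse transform: compute the DC term from input[0],
+// round it, then add it to every pixel of the size x size block with clamping
+// to the bit-depth range.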
+static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input,
+ uint16_t *dest, int stride, int bd,
+ const int size) {
+ const __m128i zero = _mm_setzero_si128();
+ // Faster than _mm_set1_epi16((1 << bd) - 1).
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+ int a1, i, j;
+ tran_low_t out;
+ __m128i dc, d;
+
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
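+ // The final-stage rounding shift is 5 bits for 8x8 and 6 bits for
+ // 16x16/32x32; the 4x4 case (4 bits) is handled separately in
+ // vpx_highbd_idct4x4_1_add_sse2().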
+ a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6);
+ dc = _mm_set1_epi16(a1);
+
+ for (i = 0; i < size; ++i) {
+ for (j = 0; j < (size >> 3); ++j) {
+ d = _mm_load_si128((const __m128i *)(&dest[j * 8]));
+ d = add_dc_clamp(&zero, &max, &dc, &d);
+ _mm_store_si128((__m128i *)(&dest[j * 8]), d);
+ }
+ dest += stride;
+ }
+}
+
static INLINE __m128i clamp_high_sse2(__m128i value, int bd) {
__m128i ubounded, retval;
const __m128i zero = _mm_set1_epi16(0);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
+ __m128i u[2];
- transpose_4x4(in);
+ transpose_16bit_4x4(in);
// stage 1
u[0] = _mm_unpacklo_epi16(in[0], in[1]);
u[1] = _mm_unpackhi_epi16(in[0], in[1]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[3], v[2]);
+ u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
+ u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);
// stage 2
in[0] = _mm_add_epi16(u[0], u[1]);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8], in7;
- transpose_4x4(in);
+ transpose_16bit_4x4(in);
in7 = _mm_srli_si128(in[1], 8);
in7 = _mm_add_epi16(in7, in[0]);
in7 = _mm_sub_epi16(in7, in[1]);
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
{ \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
+ res0 = idct_calc_wraplow_sse2(lo_0, hi_0, cst0); \
+ res1 = idct_calc_wraplow_sse2(lo_0, hi_0, cst1); \
}
#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \
stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
\
- tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
- tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
- tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ stp1_5 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_1); \
+ stp1_6 = idct_calc_wraplow_sse2(lo_56, hi_56, stg2_0); \
} \
\
/* Stage4 */ \
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
// Load input data.
}
void idct8_sse2(__m128i *in) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
// 8x8 Transpose is copied from vpx_fdct8x8_sse2()
TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0,
u2 = _mm_unpacklo_epi16(s6, s7);
u3 = _mm_unpackhi_epi16(s6, s7);
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
+ s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16);
+ s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16);
+ s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16);
+ s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16);
in[0] = s0;
in[1] = _mm_sub_epi16(k__const_0, s4);
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp0, tmp1, tmp2, tmp3;
// Rows. Load 4-row input data.
in0 = load_input_data(input);
const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero);
const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero);
- tmp0 = _mm_madd_epi16(lo_17, stg1_0);
- tmp2 = _mm_madd_epi16(lo_17, stg1_1);
- tmp4 = _mm_madd_epi16(lo_35, stg1_2);
- tmp6 = _mm_madd_epi16(lo_35, stg1_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
- stp1_5 = _mm_packs_epi32(tmp4, tmp6);
+ stp1_4 = idct_calc_wraplow_sse2(stg1_0, stg1_1, lo_17);
+ stp1_5 = idct_calc_wraplow_sse2(stg1_2, stg1_3, lo_35);
}
// Stage2
const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero);
const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero);
- tmp0 = _mm_madd_epi16(lo_04, stg2_0);
- tmp2 = _mm_madd_epi16(lo_04, stg2_1);
- tmp4 = _mm_madd_epi16(lo_26, stg2_2);
- tmp6 = _mm_madd_epi16(lo_26, stg2_3);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp2_0 = _mm_packs_epi32(tmp0, tmp2);
- stp2_2 = _mm_packs_epi32(tmp6, tmp4);
+ stp2_0 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_04);
+ stp2_2 = idct_calc_wraplow_sse2(stg2_3, stg2_2, lo_26);
tmp0 = _mm_add_epi16(stp1_4, stp1_5);
tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
{
const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
- tmp4 = _mm_add_epi16(stp2_0, stp2_2);
- tmp6 = _mm_sub_epi16(stp2_0, stp2_2);
-
- stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4);
- stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4);
-
- tmp0 = _mm_madd_epi16(lo_56, stg3_0);
- tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
- stp1_5 = _mm_packs_epi32(tmp0, tmp2);
+ tmp0 = _mm_add_epi16(stp2_0, stp2_2);
+ tmp1 = _mm_sub_epi16(stp2_0, stp2_2);
+ stp1_2 = _mm_unpackhi_epi64(tmp1, tmp0);
+ stp1_3 = _mm_unpacklo_epi64(tmp1, tmp0);
+ stp1_5 = idct_calc_wraplow_sse2(stg3_0, stg2_0, lo_56); // stg3_1 = stg2_0
}
// Stage4
stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
\
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \
+ stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \
\
stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
stp1_2 = stp1_1; \
stp1_3 = stp1_0; \
\
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \
+ stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \
\
stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
stp1_8_0, stp1_12_0;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
curr1 = l;
u[6] = _mm_unpacklo_epi16(s[14], s[15]);
u[7] = _mm_unpackhi_epi16(s[14], s[15]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
+ in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
+ in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+ in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
+ in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
+ in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
+ in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
+ in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
+ in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);
in[0] = s[0];
in[1] = _mm_sub_epi16(kZero, s[8]);
in[2] = s[12];
in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
in[12] = s[5];
in[13] = _mm_sub_epi16(kZero, s[13]);
in[14] = s[9];
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i v[16], u[16], s[16], t[16];
+ __m128i u[16], s[16], t[16];
// stage 1
s[0] = in[0];
u[6] = _mm_unpacklo_epi16(s[11], s[12]);
u[7] = _mm_unpackhi_epi16(s[11], s[12]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[8] = _mm_packs_epi32(u[0], u[1]);
- s[15] = _mm_packs_epi32(u[2], u[3]);
- s[9] = _mm_packs_epi32(u[4], u[5]);
- s[14] = _mm_packs_epi32(u[6], u[7]);
- s[10] = _mm_packs_epi32(u[8], u[9]);
- s[13] = _mm_packs_epi32(u[10], u[11]);
- s[11] = _mm_packs_epi32(u[12], u[13]);
- s[12] = _mm_packs_epi32(u[14], u[15]);
+ s[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p30_m02);
+ s[15] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p02_p30);
+ s[9] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p14_m18);
+ s[14] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p18_p14);
+ s[10] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p22_m10);
+ s[13] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p10_p22);
+ s[11] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p06_m26);
+ s[12] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p26_p06);
// stage 3
t[0] = s[0];
u[2] = _mm_unpacklo_epi16(s[5], s[6]);
u[3] = _mm_unpackhi_epi16(s[5], s[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[4] = _mm_packs_epi32(u[0], u[1]);
- t[7] = _mm_packs_epi32(u[2], u[3]);
- t[5] = _mm_packs_epi32(u[4], u[5]);
- t[6] = _mm_packs_epi32(u[6], u[7]);
+ t[4] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p28_m04);
+ t[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p04_p28);
+ t[5] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p12_m20);
+ t[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p20_p12);
t[8] = _mm_add_epi16(s[8], s[9]);
t[9] = _mm_sub_epi16(s[8], s[9]);
t[10] = _mm_sub_epi16(s[11], s[10]);
u[6] = _mm_unpacklo_epi16(t[10], t[13]);
u[7] = _mm_unpackhi_epi16(t[10], t[13]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
+ s[0] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
+ s[1] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
+ s[2] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p24_m08);
+ s[3] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p08_p24);
+ s[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m08_p24);
+ s[14] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p24_p08);
+ s[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m24_m08);
+ s[13] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m08_p24);
s[4] = _mm_add_epi16(t[4], t[5]);
s[5] = _mm_sub_epi16(t[4], t[5]);
s[6] = _mm_sub_epi16(t[7], t[6]);
s[7] = _mm_add_epi16(t[6], t[7]);
s[8] = t[8];
s[15] = t[15];
- s[9] = _mm_packs_epi32(u[8], u[9]);
- s[14] = _mm_packs_epi32(u[10], u[11]);
- s[10] = _mm_packs_epi32(u[12], u[13]);
- s[13] = _mm_packs_epi32(u[14], u[15]);
s[11] = t[11];
s[12] = t[12];
u[0] = _mm_unpacklo_epi16(s[5], s[6]);
u[1] = _mm_unpackhi_epi16(s[5], s[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- t[5] = _mm_packs_epi32(u[0], u[1]);
- t[6] = _mm_packs_epi32(u[2], u[3]);
+ t[5] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
+ t[6] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
t[8] = _mm_add_epi16(s[8], s[11]);
t[9] = _mm_add_epi16(s[9], s[10]);
u[2] = _mm_unpacklo_epi16(t[11], t[12]);
u[3] = _mm_unpackhi_epi16(t[11], t[12]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- s[10] = _mm_packs_epi32(u[0], u[1]);
- s[13] = _mm_packs_epi32(u[2], u[3]);
- s[11] = _mm_packs_epi32(u[4], u[5]);
- s[12] = _mm_packs_epi32(u[6], u[7]);
+ s[10] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_p16);
+ s[13] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
+ s[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
+ s[12] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
s[14] = t[14];
s[15] = t[15];
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
stp1_12_0;
__m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ __m128i tmp0, tmp1, tmp2, tmp3;
int i;
// First 1-D inverse DCT
// Load input data.
const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero);
const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]);
- tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
- tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
- tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
- tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
- stp2_8 = _mm_packs_epi32(tmp0, tmp2);
- stp2_11 = _mm_packs_epi32(tmp5, tmp7);
+ stp2_8 = idct_calc_wraplow_sse2(stg2_0, stg2_1, lo_1_15);
+ stp2_11 = idct_calc_wraplow_sse2(stg2_6, stg2_7, lo_13_3);
}
// Stage3
{
const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero);
- tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
- tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
-
+ stp1_4 = idct_calc_wraplow_sse2(stg3_0, stg3_1, lo_2_14);
stp1_13 = _mm_unpackhi_epi64(stp2_11, zero);
stp1_14 = _mm_unpackhi_epi64(stp2_8, zero);
-
- stp1_4 = _mm_packs_epi32(tmp0, tmp2);
}
// Stage4
const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14);
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13);
- tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
- tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
- tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
- tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
- tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
- tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
-
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp5 = _mm_add_epi32(tmp5, rounding);
- tmp7 = _mm_add_epi32(tmp7, rounding);
-
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
-
+ tmp0 = idct_madd_round_shift_sse2(lo_0_8, stg4_0);
+ tmp1 = idct_madd_round_shift_sse2(lo_0_8, stg4_1);
stp1_0 = _mm_packs_epi32(tmp0, tmp0);
- stp1_1 = _mm_packs_epi32(tmp2, tmp2);
- stp2_9 = _mm_packs_epi32(tmp1, tmp3);
- stp2_10 = _mm_packs_epi32(tmp5, tmp7);
+ stp1_1 = _mm_packs_epi32(tmp1, tmp1);
+ stp2_9 = idct_calc_wraplow_sse2(stg4_4, stg4_5, lo_9_14);
+ stp2_10 = idct_calc_wraplow_sse2(stg4_6, stg4_7, lo_10_13);
stp2_6 = _mm_unpackhi_epi64(stp1_4, zero);
}
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
- tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
- tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
- tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
- tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
- tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
- tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
-
- tmp1 = _mm_add_epi32(tmp1, rounding);
- tmp3 = _mm_add_epi32(tmp3, rounding);
- tmp0 = _mm_add_epi32(tmp0, rounding);
- tmp2 = _mm_add_epi32(tmp2, rounding);
- tmp4 = _mm_add_epi32(tmp4, rounding);
- tmp6 = _mm_add_epi32(tmp6, rounding);
-
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
-
- stp1_6 = _mm_packs_epi32(tmp3, tmp1);
+ stp1_6 = idct_calc_wraplow_sse2(stg4_0, stg4_1, lo_6_5);
+ tmp0 = idct_madd_round_shift_sse2(lo_10_13, stg6_0);
+ tmp1 = idct_madd_round_shift_sse2(lo_10_13, stg4_0);
+ tmp2 = idct_madd_round_shift_sse2(lo_11_12, stg6_0);
+ tmp3 = idct_madd_round_shift_sse2(lo_11_12, stg4_0);
stp2_10 = _mm_packs_epi32(tmp0, zero);
- stp2_13 = _mm_packs_epi32(tmp2, zero);
- stp2_11 = _mm_packs_epi32(tmp4, zero);
- stp2_12 = _mm_packs_epi32(tmp6, zero);
+ stp2_13 = _mm_packs_epi32(tmp1, zero);
+ stp2_11 = _mm_packs_epi32(tmp2, zero);
+ stp2_12 = _mm_packs_epi32(tmp3, zero);
tmp0 = _mm_add_epi16(stp1_0, stp1_4);
tmp1 = _mm_sub_epi16(stp1_0, stp1_4);
stp1_2 = stp2_1; \
stp1_3 = stp2_0; \
\
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \
+ stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \
\
stp1_4 = stp2_4; \
stp1_7 = stp2_7; \
stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
\
- tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
- tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
- tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
- tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- \
- stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
- stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ stp1_5 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_1); \
+ stp1_6 = idct_calc_wraplow_sse2(lo_6_5, hi_6_5, stg4_0); \
\
stp1_4 = stp2_4; \
stp1_7 = stp2_7; \
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
const __m128i zero = _mm_setzero_si128();
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
// idct constants for each stage
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i;
// Load input data. Only need to load the top left 8x8 block.
void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
int stride) {
- const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1 << 5);
const __m128i zero = _mm_setzero_si128();
stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
- __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
int i, j, i32;
for (i = 0; i < 4; i++) {
res0[15] = tbuf[7];
}
+static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
+ const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
+ return _mm_srai_epi32(t, DCT_CONST_BITS);
+}
+
+static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
+ const __m128i cospi) {
+ const __m128i t = _mm_madd_epi16(in, cospi);
+ return dct_const_round_shift_sse2(t);
+}
+
+// Calculate the dot product between in0/1 and x and wrap to short.
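+// Per 32-bit lane this is equivalent to
+//   (int16_t)saturate((a0 * c0 + a1 * c1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS)
+// where (a0, a1) are adjacent 16-bit inputs and (c0, c1) are the paired
+// cosine constants in x; _mm_packs_epi32() provides the saturation.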
+static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
+ const __m128i in1,
+ const __m128i x) {
+ const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
+ const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
+ return _mm_packs_epi32(t0, t1);
+}
+
// Function to allow 8-bit optimisations to be used when profile 0 is used
// with highbitdepth enabled
static INLINE __m128i load_input_data(const tran_low_t *data) {
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
res0, res1, res2, res3) \
{ \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
+ res0 = idct_calc_wraplow_sse2(lo_0, hi_0, cst0); \
+ res1 = idct_calc_wraplow_sse2(lo_0, hi_0, cst1); \
+ res2 = idct_calc_wraplow_sse2(lo_1, hi_1, cst2); \
+ res3 = idct_calc_wraplow_sse2(lo_1, hi_1, cst3); \
}
static INLINE void recon_and_store4x4_sse2(const __m128i *const in,
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"
-static INLINE void transpose_4x4(__m128i *res) {
+static INLINE void transpose_16bit_4x4(__m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}
+static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1,
+ __m128i *const a2, __m128i *const a3) {
+ // Unpack 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0: 00 10 01 11
+ // b1: 20 30 21 31
+ // b2: 02 12 03 13
+ // b3: 22 32 23 33
+
+ const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1);
+ const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3);
+ const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1);
+ const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3);
+
+ // Unpack 64 bit elements resulting in:
+ // a0: 00 10 20 30
+ // a1: 01 11 21 31
+ // a2: 02 12 22 32
+ // a3: 03 13 23 33
+ *a0 = _mm_unpacklo_epi64(b0, b1);
+ *a1 = _mm_unpackhi_epi64(b0, b1);
+ *a2 = _mm_unpacklo_epi64(b2, b3);
+ *a3 = _mm_unpackhi_epi64(b2, b3);
+}
+
#endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_
#define DECLARE_ALIGNED(n, typ, val) typ val
#endif
-#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
-#define DECLARE_PROTECTED(decl) decl __attribute__((visibility("protected")))
-#else
-#define DECLARE_PROTECTED(decl) decl
-#endif
-
#if HAVE_NEON && defined(_MSC_VER)
#define __builtin_prefetch(x)
#endif
assert(ybf->y_height - ybf->y_crop_height >= 0);
assert(ybf->y_width - ybf->y_crop_width >= 0);
-#if CONFIG_VP9_HIGHBITDEPTH
- if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
- extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
- ybf->y_crop_height, ybf->border, ybf->border,
- ybf->border + ybf->y_height - ybf->y_crop_height,
- ybf->border + ybf->y_width - ybf->y_crop_width);
-
- extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width,
- ybf->uv_crop_height, uv_border, uv_border,
- uv_border + ybf->uv_height - ybf->uv_crop_height,
- uv_border + ybf->uv_width - ybf->uv_crop_width);
-
- extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width,
- ybf->uv_crop_height, uv_border, uv_border,
- uv_border + ybf->uv_height - ybf->uv_crop_height,
- uv_border + ybf->uv_width - ybf->uv_crop_width);
- return;
- }
-#endif
extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width,
ybf->y_crop_height, ybf->border, ybf->border,
ybf->border + ybf->y_height - ybf->y_crop_height,
// Copies the source image into the destination image and updates the
// destination's UMV borders.
// Note: The frames are assumed to be identical in size.
+
void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
YV12_BUFFER_CONFIG *dst_ybc) {
int row;
const uint8_t *src = src_ybc->y_buffer;
uint8_t *dst = dst_ybc->y_buffer;
+#if 0
+ /* These assertions are valid in the codec, but the libvpx-tester uses
+ * this code slightly differently.
+ */
+ assert(src_ybc->y_width == dst_ybc->y_width);
+ assert(src_ybc->y_height == dst_ybc->y_height);
+#endif
+
+ for (row = 0; row < src_ybc->y_height; ++row) {
+ memcpy(dst, src, src_ybc->y_width);
+ src += src_ybc->y_stride;
+ dst += dst_ybc->y_stride;
+ }
+
+ src = src_ybc->u_buffer;
+ dst = dst_ybc->u_buffer;
+
+ for (row = 0; row < src_ybc->uv_height; ++row) {
+ memcpy(dst, src, src_ybc->uv_width);
+ src += src_ybc->uv_stride;
+ dst += dst_ybc->uv_stride;
+ }
+
+ src = src_ybc->v_buffer;
+ dst = dst_ybc->v_buffer;
+
+ for (row = 0; row < src_ybc->uv_height; ++row) {
+ memcpy(dst, src, src_ybc->uv_width);
+ src += src_ybc->uv_stride;
+ dst += dst_ybc->uv_stride;
+ }
+
+ vp8_yv12_extend_frame_borders_c(dst_ybc);
+}
+
+#if CONFIG_VP9
+void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc,
+ YV12_BUFFER_CONFIG *dst_ybc) {
+ int row;
+ const uint8_t *src = src_ybc->y_buffer;
+ uint8_t *dst = dst_ybc->y_buffer;
+
#if 0
/* These assertions are valid in the codec, but the libvpx-tester uses
* this code slightly differently.
dst += dst_ybc->uv_stride;
}
- vp8_yv12_extend_frame_borders_c(dst_ybc);
+ vpx_extend_frame_borders_c(dst_ybc);
return;
} else {
assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH));
dst += dst_ybc->uv_stride;
}
- vp8_yv12_extend_frame_borders_c(dst_ybc);
+ vpx_extend_frame_borders_c(dst_ybc);
}
+#endif // CONFIG_VP9
void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
YV12_BUFFER_CONFIG *dst_ybc) {
add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
if (vpx_config("CONFIG_VP9") eq "yes") {
+ add_proto qw/void vpx_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
+
add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf";
specialize qw/vpx_extend_frame_borders dspr2/;