From f76ccce5bc69c857d7dbf9a8b64a4fafbd8ca3ca Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Wed, 15 Apr 2015 17:48:20 -0700
Subject: [PATCH] Revert "Revert "Force_split on 16x16 blocks in variance
 partition.""

This reverts commit 004b9d83e37d355f590a6976a27b7b845d19a869

Change-Id: I2f2d0bdb9368c2c07f1d29a69cd461267a3a8743
---
 vp9/common/vp9_rtcd_defs.pl           |   5 +
 vp9/encoder/vp9_avg.c                 |  30 ++++++
 vp9/encoder/vp9_encodeframe.c         | 134 ++++++++++++++++++++++----
 vp9/encoder/vp9_encoder.h             |   2 +
 vp9/encoder/x86/vp9_avg_intrin_sse2.c |  77 +++++++++++++++
 5 files changed, 228 insertions(+), 20 deletions(-)

diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index c8628f84f..537cc6c4c 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1114,6 +1114,9 @@ specialize qw/vp9_avg_8x8 sse2 neon/;
 add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
 specialize qw/vp9_avg_4x4 sse2/;
 
+add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+specialize qw/vp9_minmax_8x8 sse2/;
+
 add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
 specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64";
 
@@ -1137,6 +1140,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   specialize qw/vp9_highbd_avg_8x8/;
   add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
   specialize qw/vp9_highbd_avg_4x4/;
+  add_proto qw/void vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+  specialize qw/vp9_highbd_minmax_8x8/;
 }
 
 # ENCODEMB INVOKE
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index 58daa3ad4..e26a03cef 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -155,6 +155,20 @@ int vp9_vector_var_c(int16_t const *ref, int16_t const *src,
   return var;
 }
 
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+                      int *min, int *max) {
+  int i, j;
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j]-d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
   int i, j;
@@ -175,6 +189,22 @@ unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) {
   return (sum + 8) >> 4;
 }
 
+
+void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+                             int dp, int *min, int *max) {
+  int i, j;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+  *min = 255;
+  *max = 0;
+  for (i = 0; i < 8; ++i, s += p, d += dp) {
+    for (j = 0; j < 8; ++j) {
+      int diff = abs(s[j]-d[j]);
+      *min = diff < *min ? diff : *min;
+      *max = diff > *max ? diff : *max;
+    }
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index e59d2c2a0..d4281751b 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -390,18 +390,21 @@ static int set_vt_partitioning(VP9_COMP *cpi,
   variance_node vt;
   const int block_width = num_8x8_blocks_wide_lookup[bsize];
   const int block_height = num_8x8_blocks_high_lookup[bsize];
+  const int low_res = (cm->width <= 352 && cm->height <= 288);
 
   assert(block_height == block_width);
   tree_to_node(data, bsize, &vt);
 
-  if (force_split)
+  if (force_split == 1)
     return 0;
 
   // For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
   // variance is below threshold, otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
   if (bsize == bsize_min) {
-    get_variance(&vt.part_variances->none);
+    // Variance already computed to set the force_split.
+    if (low_res || cm->frame_type == KEY_FRAME)
+      get_variance(&vt.part_variances->none);
     if (mi_col + block_width / 2 < cm->mi_cols &&
         mi_row + block_height / 2 < cm->mi_rows &&
         vt.part_variances->none.variance < threshold) {
@@ -410,11 +413,10 @@
     }
     return 0;
   } else if (bsize > bsize_min) {
-    // Variance is already computed for 32x32 blocks to set the force_split.
-    if (bsize != BLOCK_32X32)
+    // Variance already computed to set the force_split.
+    if (low_res || cm->frame_type == KEY_FRAME)
       get_variance(&vt.part_variances->none);
-    // For key frame or low_res: for bsize above 32X32 or very high variance,
-    // take split.
+    // For key frame: take split for bsize above 32X32 or very high variance.
     if (cm->frame_type == KEY_FRAME &&
         (bsize > BLOCK_32X32 ||
          vt.part_variances->none.variance > (threshold << 4))) {
@@ -483,19 +485,66 @@
       cpi->vbp_thresholds[1] = threshold_base >> 2;
       cpi->vbp_thresholds[2] = threshold_base >> 2;
       cpi->vbp_thresholds[3] = threshold_base << 2;
+      cpi->vbp_threshold_sad = 0;
       cpi->vbp_bsize_min = BLOCK_8X8;
     } else {
       cpi->vbp_thresholds[1] = threshold_base;
       if (cm->width <= 352 && cm->height <= 288) {
         cpi->vbp_thresholds[0] = threshold_base >> 2;
         cpi->vbp_thresholds[2] = threshold_base << 3;
+        cpi->vbp_threshold_sad = 100;
       } else {
         cpi->vbp_thresholds[0] = threshold_base;
+        cpi->vbp_thresholds[1] = (5 * threshold_base) >> 2;
         cpi->vbp_thresholds[2] = threshold_base << cpi->oxcf.speed;
+        cpi->vbp_threshold_sad = 1000;
       }
       cpi->vbp_bsize_min = BLOCK_16X16;
     }
+    cpi->vbp_threshold_minmax = 15 + (q >> 3);
+  }
+}
+
+// Compute the minmax over the 8x8 subblocks.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+                              int dp, int x16_idx, int y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+                              int highbd_flag,
+#endif
+                              int pixels_wide,
+                              int pixels_high) {
+  int k;
+  int minmax_max = 0;
+  int minmax_min = 255;
+  // Loop over the 4 8x8 subblocks.
+  for (k = 0; k < 4; k++) {
+    int x8_idx = x16_idx + ((k & 1) << 3);
+    int y8_idx = y16_idx + ((k >> 1) << 3);
+    int min = 0;
+    int max = 0;
+    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+        vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                              d + y8_idx * dp + x8_idx, dp,
+                              &min, &max);
+      } else {
+        vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                       d + y8_idx * dp + x8_idx, dp,
+                       &min, &max);
+      }
+#else
+      vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+                     d + y8_idx * dp + x8_idx, dp,
+                     &min, &max);
+#endif
+      if ((max - min) > minmax_max)
+        minmax_max = (max - min);
+      if ((max - min) < minmax_min)
+        minmax_min = (max - min);
+    }
   }
+  return (minmax_max - minmax_min);
 }
 
 static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
@@ -510,6 +559,7 @@ static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
     thresholds[2] = threshold_base << 3;
   } else {
     thresholds[0] = threshold_base;
+    thresholds[1] = (5 * threshold_base) >> 2;
     thresholds[2] = threshold_base << cpi->oxcf.speed;
   }
 }
@@ -594,7 +644,7 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
 
 // This function chooses partitioning based on the variance between source and
 // reconstructed last, where variance is computed for down-sampled inputs.
-static void choose_partitioning(VP9_COMP *cpi,
+static int choose_partitioning(VP9_COMP *cpi,
                                 const TileInfo *const tile,
                                 MACROBLOCK *x,
                                 int mi_row, int mi_col) {
@@ -603,7 +653,7 @@ static void choose_partitioning(VP9_COMP *cpi,
   int i, j, k, m;
   v64x64 vt;
   v16x16 vt2[16];
-  int force_split[5];
+  int force_split[21];
   uint8_t *s;
   const uint8_t *d;
   int sp;
@@ -699,6 +749,19 @@
     d = xd->plane[0].dst.buf;
     dp = xd->plane[0].dst.stride;
+
+    // If the y_sad is very small, take 64x64 as partition and exit.
+    // Don't check on boosted segment for now, as 64x64 is suppressed there.
+    if (segment_id == CR_SEGMENT_ID_BASE &&
+        y_sad < cpi->vbp_threshold_sad) {
+      const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+      const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+      if (mi_col + block_width / 2 < cm->mi_cols &&
+          mi_row + block_height / 2 < cm->mi_rows) {
+        set_block_size(cpi, xd, mi_row, mi_col, BLOCK_64X64);
+        return 0;
+      }
+    }
   } else {
     d = VP9_VAR_OFFS;
     dp = 0;
   }
@@ -721,6 +784,7 @@
   }
 
   // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+  // 5-20 for the 16x16 blocks.
   force_split[0] = 0;
   // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
   // for splits.
@@ -732,7 +796,9 @@
     for (j = 0; j < 4; j++) {
       const int x16_idx = x32_idx + ((j & 1) << 4);
       const int y16_idx = y32_idx + ((j >> 1) << 4);
+      const int split_index = 5 + i2 + j;
       v16x16 *vst = &vt.split[i].split[j];
+      force_split[split_index] = 0;
       variance4x4downsample[i2 + j] = 0;
       if (!is_key_frame) {
         fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
@@ -743,15 +809,36 @@
                              pixels_high, is_key_frame);
         fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
-        // For low-resolution, compute the variance based on 8x8 down-sampling,
-        // and if it is large (above the threshold) we go down for 4x4.
-        // For key frame we always go down to 4x4.
-        if (low_res)
-          get_variance(&vt.split[i].split[j].part_variances.none);
+        get_variance(&vt.split[i].split[j].part_variances.none);
+        if (vt.split[i].split[j].part_variances.none.variance >
+            thresholds[2]) {
+          // 16X16 variance is above threshold for split, so force split to 8x8
+          // for this 16x16 block (this also forces splits for upper levels).
+          force_split[split_index] = 1;
+          force_split[i + 1] = 1;
+          force_split[0] = 1;
+        } else if (vt.split[i].split[j].part_variances.none.variance >
+                   thresholds[1] &&
+                   !cyclic_refresh_segment_id_boosted(segment_id)) {
+          // We have some nominal amount of 16x16 variance (based on average),
+          // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+          // force split to 8x8 block for this 16x16 block.
+          int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                          xd->cur_buf->flags,
+#endif
+                                          pixels_wide, pixels_high);
+          if (minmax > cpi->vbp_threshold_minmax) {
+            force_split[split_index] = 1;
+            force_split[i + 1] = 1;
+            force_split[0] = 1;
+          }
+        }
       }
       if (is_key_frame || (low_res &&
           vt.split[i].split[j].part_variances.none.variance >
           (thresholds[1] << 1))) {
+        force_split[split_index] = 0;
         // Go down to 4x4 down-sampling for variance.
         variance4x4downsample[i2 + j] = 1;
         for (k = 0; k < 4; k++) {
@@ -786,16 +873,20 @@
     fill_variance_tree(&vt.split[i], BLOCK_32X32);
     // If variance of this 32x32 block is above the threshold, force the block
     // to split. This also forces a split on the upper (64x64) level.
-    get_variance(&vt.split[i].part_variances.none);
-    if (vt.split[i].part_variances.none.variance > thresholds[1]) {
-      force_split[i + 1] = 1;
-      force_split[0] = 1;
+    if (!force_split[i + 1]) {
+      get_variance(&vt.split[i].part_variances.none);
+      if (vt.split[i].part_variances.none.variance > thresholds[1]) {
+        force_split[i + 1] = 1;
+        force_split[0] = 1;
+      }
     }
   }
-  if (!force_split[0])
+  if (!force_split[0]) {
     fill_variance_tree(&vt, BLOCK_64X64);
+    get_variance(&vt.part_variances.none);
+  }
 
-  // Now go through the entire structure,  splitting every block size until
+  // Now go through the entire structure, splitting every block size until
   // we get to one that's got a variance lower than our threshold.
   if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
       !set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
@@ -820,7 +911,9 @@
           if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
                                    mi_row + y32_idx + y16_idx,
                                    mi_col + x32_idx + x16_idx,
-                                   thresholds[2], cpi->vbp_bsize_min, 0)) {
+                                   thresholds[2],
+                                   cpi->vbp_bsize_min,
+                                   force_split[5 + i2 + j])) {
             for (k = 0; k < 4; ++k) {
               const int x8_idx = (k & 1);
               const int y8_idx = (k >> 1);
@@ -847,6 +940,7 @@
       }
     }
   }
+  return 0;
 }
 
 static void update_state(VP9_COMP *cpi, ThreadData *td,
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index a5342ade3..42305a91c 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -463,6 +463,8 @@ typedef struct VP9_COMP {
   // 0 - threshold_64x64; 1 - threshold_32x32;
   // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
   int64_t vbp_thresholds[4];
+  int64_t vbp_threshold_minmax;
+  int64_t vbp_threshold_sad;
   BLOCK_SIZE vbp_bsize_min;
 
   // Multi-threading
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index ecd6ce9a2..4672aa6b8 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -11,6 +11,83 @@
 #include <emmintrin.h>
 #include "vpx_ports/mem.h"
 
+void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+                         int *min, int *max) {
+  __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+  u0 = _mm_setzero_si128();
+  // Row 0
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff0 = _mm_max_epi16(diff, negdiff);
+  // Row 1
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+  minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+  // Row 2
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 3
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 4
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 5
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 6
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+  // Row 7
+  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+  d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+  diff = _mm_subs_epi16(s0, d0);
+  negdiff = _mm_subs_epi16(u0, diff);
+  absdiff = _mm_max_epi16(diff, negdiff);
+  maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+  minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+  maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+  *max = _mm_extract_epi16(maxabsdiff, 0);
+
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+  minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+  *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
 unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
   __m128i s0, s1, u0;
-- 
2.40.0
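
For context on the new force-split criterion, below is a minimal standalone sketch of the 16x16 minmax test. The helper mirrors the patch's vp9_minmax_8x8_c; everything else (the minmax_8x8 name, the main driver, the sample buffers, and the concrete threshold of 25) is hypothetical and only illustrates how choose_partitioning compares the spread of per-subblock (max - min) values against cpi->vbp_threshold_minmax, which the encoder derives from the quantizer as 15 + (q >> 3).

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Same logic as the patch's vp9_minmax_8x8_c: min and max absolute
     * difference between an 8x8 source block and its reconstruction. */
    static void minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, int dp,
                           int *min, int *max) {
      int i, j;
      *min = 255;
      *max = 0;
      for (i = 0; i < 8; ++i, s += sp, d += dp) {
        for (j = 0; j < 8; ++j) {
          const int diff = abs(s[j] - d[j]);
          if (diff < *min) *min = diff;
          if (diff > *max) *max = diff;
        }
      }
    }

    int main(void) {
      /* Illustrative 16x16 source/reconstruction pair, stride 16. */
      uint8_t src[16 * 16], rec[16 * 16];
      int k, minmax_max = 0, minmax_min = 255;
      /* Hypothetical threshold; the encoder uses 15 + (q >> 3). */
      const int threshold_minmax = 25;
      for (k = 0; k < 16 * 16; ++k) {
        src[k] = (uint8_t)(k & 255);
        rec[k] = (uint8_t)((k + 2 * (k >> 4)) & 255);
      }
      /* Spread of (max - min) over the four 8x8 subblocks, mirroring
       * compute_minmax_8x8 in the patch. */
      for (k = 0; k < 4; ++k) {
        const int x8 = (k & 1) << 3;
        const int y8 = (k >> 1) << 3;
        int mn, mx;
        minmax_8x8(src + y8 * 16 + x8, 16, rec + y8 * 16 + x8, 16, &mn, &mx);
        if (mx - mn > minmax_max) minmax_max = mx - mn;
        if (mx - mn < minmax_min) minmax_min = mx - mn;
      }
      printf("minmax spread %d -> %s split\n", minmax_max - minmax_min,
             (minmax_max - minmax_min > threshold_minmax) ? "force" : "no");
      return 0;
    }

A block whose four subblocks differ uniformly from the reconstruction keeps a small spread and is left to the ordinary variance test; a block where only one subblock deviates strongly produces a large spread and is forced to split to 8x8.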