From: Yunqing Wang Date: Thu, 1 May 2014 22:14:39 +0000 (-0700) Subject: Decide the partitioning threshold from the variance histogram X-Git-Tag: v1.4.0~1299 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9d41313e4bd28d3c504c26079cbe4a499e7eceb7;p=libvpx Decide the partitioning threshold from the variance histogram Before encoding a frame, calculate and store each 16x16 block's variance of source difference between last and current frame. Find partitioning threshold T for the frame from its variance histogram, and then use T to make partition decisions. Comparing with fixed 16x16 partitioning, rtc set test showed an overall psnr gain of 3.242%, and ssim gain of 3.751%. The best psnr gain is 8.653%. The overall encoding speed didn't change much. It got faster for some clips(for example, 12% speedup for vidyo1), and a little slower for others. Also, a minor modification was made in datarate unit test. Change-Id: Ie290743aa3814e83607b93831b667a2a49d0932c --- diff --git a/test/datarate_test.cc b/test/datarate_test.cc index 80be05ee9..8dcf26ca2 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -576,7 +576,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting3TemporalLayersFrameDropping) { // Expect some frame drops in this test: for this 200 frames test, // expect at least 10% and not more than 60% drops. ASSERT_GE(num_drops_, 20); - ASSERT_LE(num_drops_, 120); + ASSERT_LE(num_drops_, 130); } } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 454d0da90..61d9d5d1e 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -20,6 +20,12 @@ extern "C" { #endif +typedef struct { + unsigned int sse; + int sum; + unsigned int var; +} diff; + struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); int16_t *qcoeff; diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index c9825eab5..d41355136 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -70,12 +70,6 @@ static const uint8_t VP9_VAR_OFFS[64] = { 128, 128, 128, 128, 128, 128, 128, 128 }; -typedef struct { - unsigned int sse; - int sum; - unsigned int var; -} diff; - static unsigned int get_sby_perpixel_variance(VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs) { @@ -1140,7 +1134,6 @@ static void constrain_copy_partitioning(VP9_COMP *const cpi, } } - const struct { int row; int col; @@ -1173,34 +1166,26 @@ static void set_source_var_based_partition(VP9_COMP *cpi, // In-image SB64 if ((col8x8_remaining >= MI_BLOCK_SIZE) && (row8x8_remaining >= MI_BLOCK_SIZE)) { - const int src_stride = x->plane[0].src.stride; - const int pre_stride = cpi->Last_Source->y_stride; - const uint8_t *src = x->plane[0].src.buf; - const int pre_offset = (mi_row * MI_SIZE) * pre_stride + - (mi_col * MI_SIZE); - const uint8_t *pre_src = cpi->Last_Source->y_buffer + pre_offset; - const unsigned int thr_32x32 = cpi->sf.source_var_thresh; - const unsigned int thr_64x64 = thr_32x32 << 1; int i, j; int index; diff d32[4]; - int use16x16 = 0; + const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); + int is_larger_better = 0; + int use32x32 = 0; + int thr = cpi->source_var_thresh; + + vpx_memset(d32, 0, 4 * sizeof(diff)); for (i = 0; i < 4; i++) { - diff d16[4]; + diff *d16[4]; for (j = 0; j < 4; j++) { int b_mi_row = coord_lookup[i * 4 + j].row; int b_mi_col = coord_lookup[i * 4 + j].col; - int b_offset = b_mi_row * MI_SIZE * src_stride + - b_mi_col * MI_SIZE; - - vp9_get16x16var(src + b_offset, src_stride, - pre_src + b_offset, pre_stride, - &d16[j].sse, &d16[j].sum); + int boffset = b_mi_row / 2 * cm->mb_cols + + b_mi_col / 2; - d16[j].var = d16[j].sse - - (((uint32_t)d16[j].sum * d16[j].sum) >> 8); + d16[j] = cpi->source_diff_var + offset + boffset; index = b_mi_row * mis + b_mi_col; mi_8x8[index] = mi_upper_left + index; @@ -1210,14 +1195,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi, // size to further improve quality. } - if (d16[0].var < thr_32x32 && d16[1].var < thr_32x32 && - d16[2].var < thr_32x32 && d16[3].var < thr_32x32) { - d32[i].sse = d16[0].sse; - d32[i].sum = d16[0].sum; + is_larger_better = (d16[0]->var < thr) && (d16[1]->var < thr) && + (d16[2]->var < thr) && (d16[3]->var < thr); + + // Use 32x32 partition + if (is_larger_better) { + use32x32 += 1; - for (j = 1; j < 4; j++) { - d32[i].sse += d16[j].sse; - d32[i].sum += d16[j].sum; + for (j = 0; j < 4; j++) { + d32[i].sse += d16[j]->sse; + d32[i].sum += d16[j]->sum; } d32[i].var = d32[i].sse - (((int64_t)d32[i].sum * d32[i].sum) >> 10); @@ -1225,18 +1212,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi, index = coord_lookup[i*4].row * mis + coord_lookup[i*4].col; mi_8x8[index] = mi_upper_left + index; mi_8x8[index]->mbmi.sb_type = BLOCK_32X32; - - if (!((cm->current_video_frame - 1) % - cpi->sf.search_type_check_frequency)) - cpi->use_large_partition_rate += 1; - } else { - use16x16 = 1; } } - if (!use16x16) { - if (d32[0].var < thr_64x64 && d32[1].var < thr_64x64 && - d32[2].var < thr_64x64 && d32[3].var < thr_64x64) { + if (use32x32 == 4) { + thr <<= 1; + is_larger_better = (d32[0].var < thr) && (d32[1].var < thr) && + (d32[2].var < thr) && (d32[3].var < thr); + + // Use 64x64 partition + if (is_larger_better) { mi_8x8[0] = mi_upper_left; mi_8x8[0]->mbmi.sb_type = BLOCK_64X64; } @@ -2938,6 +2923,93 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, } // end RTC play code +static int set_var_thresh_from_histogram(VP9_COMP *cpi) { + SPEED_FEATURES *const sf = &cpi->sf; + VP9_COMMON *const cm = &cpi->common; + + const uint8_t *src = cpi->Source->y_buffer; + const uint8_t *last_src = cpi->Last_Source->y_buffer; + const int src_stride = cpi->Source->y_stride; + const int last_stride = cpi->Last_Source->y_stride; + + // Pick cutoff threshold + const int cutoff = (MIN(cm->width, cm->height) >= 720) ? + (cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) : + (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); + DECLARE_ALIGNED_ARRAY(16, int, hist, VAR_HIST_BINS); + diff *var16 = cpi->source_diff_var; + + int sum = 0; + int i, j; + + vpx_memset(hist, 0, VAR_HIST_BINS * sizeof(hist[0])); + + for (i = 0; i < cm->mb_rows; i++) { + for (j = 0; j < cm->mb_cols; j++) { + vp9_get16x16var(src, src_stride, last_src, last_stride, + &var16->sse, &var16->sum); + + var16->var = var16->sse - + (((uint32_t)var16->sum * var16->sum) >> 8); + + if (var16->var >= VAR_HIST_MAX_BG_VAR) + hist[VAR_HIST_BINS - 1]++; + else + hist[var16->var / VAR_HIST_FACTOR]++; + + src += 16; + last_src += 16; + var16++; + } + + src = src - cm->mb_cols * 16 + 16 * src_stride; + last_src = last_src - cm->mb_cols * 16 + 16 * last_stride; + } + + cpi->source_var_thresh = 0; + + if (hist[VAR_HIST_BINS - 1] < cutoff) { + for (i = 0; i < VAR_HIST_BINS - 1; i++) { + sum += hist[i]; + + if (sum > cutoff) { + cpi->source_var_thresh = (i + 1) * VAR_HIST_FACTOR; + return 0; + } + } + } + + return sf->search_type_check_frequency; +} + +static void source_var_based_partition_search_method(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + SPEED_FEATURES *const sf = &cpi->sf; + + if (cm->frame_type == KEY_FRAME) { + // For key frame, use SEARCH_PARTITION. + sf->partition_search_type = SEARCH_PARTITION; + } else if (cm->intra_only) { + sf->partition_search_type = FIXED_PARTITION; + } else { + if (cm->last_width != cm->width || cm->last_height != cm->height) { + if (cpi->source_diff_var) + vpx_free(cpi->source_diff_var); + + CHECK_MEM_ERROR(cm, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(diff))); + } + + if (!cpi->frames_till_next_var_check) + cpi->frames_till_next_var_check = set_var_thresh_from_histogram(cpi); + + if (cpi->frames_till_next_var_check > 0) { + sf->partition_search_type = FIXED_PARTITION; + cpi->frames_till_next_var_check--; + } + } +} + static int get_skip_encode_frame(const VP9_COMMON *cm) { unsigned int intra_count = 0, inter_count = 0; int j; @@ -3037,28 +3109,8 @@ static void encode_frame_internal(VP9_COMP *cpi) { } vp9_zero(x->zcoeff_blk); - if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION && - cm->current_video_frame > 0) { - int check_freq = sf->search_type_check_frequency; - - if ((cm->current_video_frame - 1) % check_freq == 0) { - cpi->use_large_partition_rate = 0; - } - - if ((cm->current_video_frame - 1) % check_freq == 1) { - const int mbs_in_b32x32 = 1 << ((b_width_log2_lookup[BLOCK_32X32] - - b_width_log2_lookup[BLOCK_16X16]) + - (b_height_log2_lookup[BLOCK_32X32] - - b_height_log2_lookup[BLOCK_16X16])); - cpi->use_large_partition_rate = cpi->use_large_partition_rate * 100 * - mbs_in_b32x32 / cm->MBs; - } - - if ((cm->current_video_frame - 1) % check_freq >= 1) { - if (cpi->use_large_partition_rate < 15) - sf->partition_search_type = FIXED_PARTITION; - } - } + if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION) + source_var_based_partition_search_method(cpi); } { diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h index 72343cdf2..fd1c9aa64 100644 --- a/vp9/encoder/vp9_encodeframe.h +++ b/vp9/encoder/vp9_encodeframe.h @@ -20,6 +20,13 @@ struct macroblock; struct yv12_buffer_config; struct VP9_COMP; +// Constants used in SOURCE_VAR_BASED_PARTITION +#define VAR_HIST_MAX_BG_VAR 1000 +#define VAR_HIST_FACTOR 10 +#define VAR_HIST_BINS (VAR_HIST_MAX_BG_VAR / VAR_HIST_FACTOR + 1) +#define VAR_HIST_LARGE_CUT_OFF 75 +#define VAR_HIST_SMALL_CUT_OFF 45 + void vp9_setup_src_planes(struct macroblock *x, const struct yv12_buffer_config *src, int mi_row, int mi_col); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index ecfefd3ba..54fb68bb6 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -194,6 +194,11 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { lc->rc_twopass_stats_in.buf = NULL; lc->rc_twopass_stats_in.sz = 0; } + + if (cpi->source_diff_var != NULL) { + vpx_free(cpi->source_diff_var); + cpi->source_diff_var = NULL; + } } static void save_coding_context(VP9_COMP *cpi) { @@ -907,6 +912,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { set_speed_features(cpi); + // Allocate memory to store variances for a frame. + CHECK_MEM_ERROR(cm, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(diff))); + cpi->source_var_thresh = 0; + cpi->frames_till_next_var_check = 0; + // Default rd threshold factors for mode selection for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index ee98baa96..5e8430a91 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -401,7 +401,11 @@ typedef struct VP9_COMP { SVC svc; - int use_large_partition_rate; + // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. + diff *source_diff_var; + // The threshold used in SOURCE_VAR_BASED_PARTITION search type. + unsigned int source_var_thresh; + int frames_till_next_var_check; int frame_flags; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index cf60c8f1c..b8a2ce0ff 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -270,7 +270,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, // Adaptively switch between SOURCE_VAR_BASED_PARTITION and FIXED_PARTITION. sf->partition_search_type = SOURCE_VAR_BASED_PARTITION; sf->search_type_check_frequency = 50; - sf->source_var_thresh = 360; sf->tx_size_search_method = (cm->frame_type == KEY_FRAME) ? USE_LARGESTALL : USE_TX_8X8; @@ -350,7 +349,6 @@ void vp9_set_speed_features(VP9_COMP *cpi) { // to FIXED_PARTITION. sf->always_this_block_size = BLOCK_16X16; sf->search_type_check_frequency = 50; - sf->source_var_thresh = 100; // Recode loop tolerence %. sf->recode_tolerance = 25; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 5160633ff..7a5cc34bc 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -353,9 +353,6 @@ typedef struct SPEED_FEATURES { // FIXED_PARTITION search type should be used. int search_type_check_frequency; - // The threshold used in SOURCE_VAR_BASED_PARTITION search type. - unsigned int source_var_thresh; - // When partition is pre-set, the inter prediction result from pick_inter_mode // can be reused in final block encoding process. It is enabled only for real- // time mode speed 6.