From: Marco Date: Thu, 23 Mar 2017 21:37:47 +0000 (-0700) Subject: vp9: 1 pass: Move source sad computation into encodeframe loop. X-Git-Tag: v1.7.0~595 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=66c6b4d6fc23900380853dbffa86656a2078dd8c;p=libvpx vp9: 1 pass: Move source sad computation into encodeframe loop. Refactor to split the 1 pass source sad computation into scene detection (currently used for VBR and screen-content mode), and superblock based source sad computation (used in non-rd CBR mode). This allows the source sad computation for CBR mode to be multi-threaded. No change in compression. Change-Id: I112f2918613ccbd37c1771d852606d3af18c1388 --- diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index d6b2ad787..42dc6830d 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -175,6 +175,10 @@ struct macroblock { uint8_t last_sb_high_content; + // For each superblock: saves the content value (e.g., low/high sad/sumdiff) + // based on source sad, prior to encoding the frame. + uint8_t content_state_sb; + // Used to save the status of whether a block has a low variance in // choose_partitioning. 0 for 64x64, 1~2 for 64x32, 3~4 for 32x64, 5~8 for // 32x32, 9~24 for 16x16. 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index cbc53047c..b6668462b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -963,6 +963,46 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, } } +static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, + int sb_offset) { + unsigned int tmp_sse; + uint64_t tmp_sad; + unsigned int tmp_variance; + const BLOCK_SIZE bsize = BLOCK_64X64; + uint8_t *src_y = cpi->Source->y_buffer; + int src_ystride = cpi->Source->y_stride; + uint8_t *last_src_y = cpi->Last_Source->y_buffer; + int last_src_ystride = cpi->Last_Source->y_stride; + uint64_t avg_source_sad_threshold = 10000; + uint64_t avg_source_sad_threshold2 = 12000; +#if CONFIG_VP9_HIGHBITDEPTH + if (cpi->common.use_highbitdepth) return; +#endif + src_y += shift; + last_src_y += shift; + tmp_sad = + cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); + tmp_variance = vpx_variance64x64(src_y, src_ystride, last_src_y, + last_src_ystride, &tmp_sse); + // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) + if (tmp_sad < avg_source_sad_threshold) + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kLowSadLowSumdiff + : kLowSadHighSumdiff; + else + x->content_state_sb = ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff + : kHighSadHighSumdiff; + if (cpi->content_state_sb_fd != NULL) { + if (tmp_sad < avg_source_sad_threshold2) { + // Cap the increment to 255. + if (cpi->content_state_sb_fd[sb_offset] < 255) + cpi->content_state_sb_fd[sb_offset]++; + } else { + cpi->content_state_sb_fd[sb_offset] = 0; + } + } + return; +} + // This function chooses partitioning based on the variance between source and // reconstructed last, where variance is computed for down-sampled inputs. 
static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, @@ -1011,17 +1051,15 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); segment_id = xd->mi[0]->segment_id; - if (cpi->sf.use_source_sad && cpi->content_state_sb != NULL && - !is_key_frame) { - // The sb_offset2 is to make it consistent with the index in the function - // vp9_avg_source_sad() in vp9_ratectrl.c. + if (cpi->sf.use_source_sad && !is_key_frame) { int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); - content_state = cpi->content_state_sb[sb_offset2]; + content_state = x->content_state_sb; x->skip_low_source_sad = (content_state == kLowSadLowSumdiff || content_state == kLowSadHighSumdiff) ? 1 : 0; - x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; + if (cpi->content_state_sb_fd != NULL) + x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; // If source_sad is low copy the partition without computing the y_sad. 
if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && copy_partitioning(cpi, x, mi_row, mi_col, segment_id, sb_offset)) { @@ -4063,6 +4101,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->color_sensitivity[1] = 0; x->sb_is_skin = 0; x->skip_low_source_sad = 0; + x->content_state_sb = 0; if (seg->enabled) { const uint8_t *const map = @@ -4074,6 +4113,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } } + if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { + int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); + int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + avg_source_sad(cpi, x, shift, sb_offset2); + } + // Set the partition type of the 64X64 block switch (partition_search_type) { case VAR_BASED_PARTITION: diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index dde6f62fc..c891957a3 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -463,9 +463,6 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->copied_frame_cnt); cpi->copied_frame_cnt = NULL; - vpx_free(cpi->content_state_sb); - cpi->content_state_sb = NULL; - vpx_free(cpi->content_state_sb_fd); cpi->content_state_sb_fd = NULL; @@ -3094,9 +3091,11 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; int q = 0, bottom_index = 0, top_index = 0; // Dummy variables. - int compute_source_sad = cpi->sf.use_source_sad || - cpi->oxcf.content == VP9E_CONTENT_SCREEN || - cpi->oxcf.rc_mode == VPX_VBR; + // Flag to check if its valid to compute the source sad (used for + // scene detection and for superblock content state in CBR mode). + // The flag may get reset below based on SVC or resizing state. 
+ cpi->compute_source_sad_onepass = + cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cm->show_frame; vpx_clear_system_state(); @@ -3144,16 +3143,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if ((cpi->use_svc && (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 || cpi->svc.current_superframe < 1)) || - cpi->resize_pending || cpi->resize_state || cpi->external_resize) { - compute_source_sad = 0; - if (cpi->content_state_sb != NULL) { - memset(cpi->content_state_sb, 0, (cm->mi_stride >> 3) * - ((cm->mi_rows >> 3) + 1) * - sizeof(*cpi->content_state_sb)); + cpi->resize_pending || cpi->resize_state || cpi->external_resize || + cpi->resize_state != ORIG) { + cpi->compute_source_sad_onepass = 0; + if (cpi->content_state_sb_fd != NULL) memset(cpi->content_state_sb_fd, 0, (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * sizeof(*cpi->content_state_sb_fd)); - } } // Avoid scaling last_source unless its needed. @@ -3166,11 +3162,16 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5) || cpi->sf.partition_search_type == SOURCE_VAR_BASED_PARTITION || (cpi->noise_estimate.enabled && !cpi->oxcf.noise_sensitivity) || - compute_source_sad)) + cpi->compute_source_sad_onepass)) cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, &cpi->scaled_last_source, (cpi->oxcf.pass == 0)); + if (cpi->Last_Source == NULL || + cpi->Last_Source->y_width != cpi->Source->y_width || + cpi->Last_Source->y_height != cpi->Source->y_height) + cpi->compute_source_sad_onepass = 0; + if (cm->frame_type == KEY_FRAME || cpi->resize_pending != 0) { memset(cpi->consec_zero_mv, 0, cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); @@ -3178,15 +3179,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_update_noise_estimate(cpi); - // Compute source_sad if the flag compute_source_sad is set, and - // only for 1 pass realtime speed >= 5 with 
show_frame = 1. - // TODO(jianj): Look into removing the condition on resize_state, - // and improving these conditions (i.e., better handle SVC case and combine - // them with condition above in compute_source_sad). - if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME && - cpi->oxcf.speed >= 5 && cpi->resize_state == ORIG && compute_source_sad && - cm->show_frame) - vp9_avg_source_sad(cpi); + // Scene detection is used for VBR mode or screen-content case. + // Make sure compute_source_sad_onepass is set (which handles SVC case + // and dynamic resize). + if (cpi->compute_source_sad_onepass && + (cpi->oxcf.rc_mode == VPX_VBR || + cpi->oxcf.content == VP9E_CONTENT_SCREEN)) + vp9_scene_detection_onepass(cpi); // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference // frame (i.e, svc->force_zero_mode_spatial_ref = 0), we can avoid this diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 346a6d8b8..2e788e04d 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -708,13 +708,12 @@ typedef struct VP9_COMP { uint8_t *copied_frame_cnt; uint8_t max_copied_frame; - // For each superblock: saves the content value (e.g., low/high sad/sumdiff) - // based on source sad, prior to encoding the frame. - uint8_t *content_state_sb; // For each superblock: keeps track of the last time (in frame distance) the // the superblock did not have low source sad. uint8_t *content_state_sb_fd; + int compute_source_sad_onepass; + LevelConstraint level_constraint; } VP9_COMP; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index d7a0e1aa8..4500124f9 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -2213,7 +2213,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { // in content and allow rate control to react. // This function also handles special case of lag_in_frames, to measure content // level in #future frames set by the lag_in_frames. 
-void vp9_avg_source_sad(VP9_COMP *cpi) { +void vp9_scene_detection_onepass(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; #if CONFIG_VP9_HIGHBITDEPTH @@ -2284,8 +2284,6 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { int num_samples = 0; int sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; int sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE; - uint64_t avg_source_sad_threshold = 10000; - uint64_t avg_source_sad_threshold2 = 12000; if (cpi->oxcf.lag_in_frames > 0) { src_y = frames[frame]->y_buffer; src_ystride = frames[frame]->y_stride; @@ -2296,34 +2294,12 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { // Checker-board pattern, ignore boundary. // If the use_source_sad is on, compute for every superblock. - if (cpi->sf.use_source_sad || - ((sbi_row > 0 && sbi_col > 0) && + if (((sbi_row > 0 && sbi_col > 0) && (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { tmp_sad = cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); - if (cpi->sf.use_source_sad && cpi->content_state_sb != NULL) { - unsigned int tmp_sse; - unsigned int tmp_variance = vpx_variance64x64( - src_y, src_ystride, last_src_y, last_src_ystride, &tmp_sse); - // Note: tmp_sse - tmp_variance = ((sum * sum) >> 12) - if (tmp_sad < avg_source_sad_threshold) - cpi->content_state_sb[num_samples] = - ((tmp_sse - tmp_variance) < 25) ? kLowSadLowSumdiff - : kLowSadHighSumdiff; - else - cpi->content_state_sb[num_samples] = - ((tmp_sse - tmp_variance) < 25) ? kHighSadLowSumdiff - : kHighSadHighSumdiff; - if (tmp_sad < avg_source_sad_threshold2) { - // Cap the increment to 255. 
- if (cpi->content_state_sb_fd[num_samples] < 255) - cpi->content_state_sb_fd[num_samples]++; - } else { - cpi->content_state_sb_fd[num_samples] = 0; - } - } avg_sad += tmp_sad; num_samples++; } diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 32353d38e..9e4623195 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -283,7 +283,7 @@ void vp9_set_target_rate(struct VP9_COMP *cpi); int vp9_resize_one_pass_cbr(struct VP9_COMP *cpi); -void vp9_avg_source_sad(struct VP9_COMP *cpi); +void vp9_scene_detection_onepass(struct VP9_COMP *cpi); int vp9_encodedframe_overshoot(struct VP9_COMP *cpi, int frame_size, int *q); diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 01c0ad62c..c06ca03d4 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -512,12 +512,9 @@ static void set_rt_speed_feature_framesize_independent( } if (!cpi->external_resize) sf->use_source_sad = 1; if (sf->use_source_sad) { - // For SVC allocate for top layer. - if (cpi->content_state_sb == NULL && + if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { - cpi->content_state_sb = (uint8_t *)vpx_calloc( - (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t)); cpi->content_state_sb_fd = (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t)); }