From 6b7848d4c9016efbfbc9529df008ffde0e15b7cc Mon Sep 17 00:00:00 2001 From: Hui Su Date: Sat, 29 Sep 2018 14:48:56 -0700 Subject: [PATCH] Introduce the ml_var_partition_pruning feature Add the ml_var_partition_pruning encoder speed feature that uses a neural net model to prune partition-none and partition-split search. The model uses prediction residue variance and quantization step size as input features. Encoding speed gain for speed 0 (tested over 20 hdres clips): average 17.7% at QP=30 and 18.3% at QP=40; max 24.46% at QP=30 and 26.6% at QP=40. Coding loss: lowres 0.071%; midres 0.098%; hdres 0.163%. Currently it is enabled for speed 0 low bit-depth only. It needs to be tuned for other settings. Change-Id: Ifb7417daa6bb6e7c97bb676269ce54ab0dc7b8c8 --- vp9/encoder/vp9_encodeframe.c | 152 +++++++++++++++++++++++++ vp9/encoder/vp9_partition_models.h | 176 ++++++++++++++++++++++++++++- vp9/encoder/vp9_speed_features.c | 3 + vp9/encoder/vp9_speed_features.h | 4 + 4 files changed, 334 insertions(+), 1 deletion(-) diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 2cc7da045..ad30951af 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -3393,6 +3393,140 @@ static void ml_prune_rect_partition(VP9_COMP *const cpi, MACROBLOCK *const x, #undef FEATURES #undef LABELS +// Use a neural net model to prune partition-none and partition-split search. +// The model uses prediction residue variance and quantization step size as +// input features. 
+#define FEATURES 6 +static void ml_predict_var_rd_paritioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *xd = &x->e_mbd; + MODE_INFO *mi = xd->mi[0]; + const NN_CONFIG *nn_config = NULL; + DECLARE_ALIGNED(16, uint8_t, pred_buf[64 * 64]); + int i; + float thresh_low = -1.0f; + float thresh_high = 0.0f; + + switch (bsize) { + case BLOCK_64X64: + nn_config = &vp9_var_rd_part_nnconfig_64; + thresh_low = -3.0f; + thresh_high = 3.0f; + break; + case BLOCK_32X32: + nn_config = &vp9_var_rd_part_nnconfig_32; + thresh_low = -3.0; + thresh_high = 3.0f; + break; + case BLOCK_16X16: + nn_config = &vp9_var_rd_part_nnconfig_16; + thresh_low = -4.0; + thresh_high = 4.0f; + break; + case BLOCK_8X8: + nn_config = &vp9_var_rd_part_nnconfig_8; + thresh_low = -2.0; + thresh_high = 2.0f; + break; + default: assert(0 && "Unexpected block size."); return; + } + + if (!nn_config) return; + + mi->ref_frame[1] = NONE; + mi->sb_type = bsize; + // Do a simple single motion search to find a prediction for current block. + // The variance of the residue will be used as input features. + { + const MV_REFERENCE_FRAME ref = + cpi->rc.is_src_frame_alt_ref ? 
ALTREF_FRAME : LAST_FRAME; + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref); + MV ref_mv = { 0, 0 }; + MV ref_mv_full = { 0, 0 }; + const int step_param = 1; + const MvLimits tmp_mv_limits = x->mv_limits; + const SEARCH_METHODS search_method = NSTEP; + const int sadpb = x->sadperbit16; + MV best_mv = { 0, 0 }; + int cost_list[5]; + + assert(yv12 != NULL); + if (!yv12) return; + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ref - 1].sf); + mi->ref_frame[0] = ref; + vp9_set_mv_search_range(&x->mv_limits, &ref_mv); + vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &ref_mv, &best_mv, 0, 0); + best_mv.row *= 8; + best_mv.col *= 8; + x->mv_limits = tmp_mv_limits; + mi->mv[0].as_mv = best_mv; + + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); + xd->plane[0].dst.buf = pred_buf; + xd->plane[0].dst.stride = 64; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize); + } + + vpx_clear_system_state(); + + { + float features[FEATURES] = { 0.0f }; + const int dc_q = vp9_dc_quant(cm->base_qindex, 0, cm->bit_depth); + int feature_idx = 0; + float score; + + // Generate model input features. + features[feature_idx++] = logf((float)(dc_q * dc_q) / 256.0f + 1.0f); + vp9_setup_src_planes(x, cpi->Source, mi_row, mi_col); + // Get the variance of the residue as input features. + { + const int bs = 4 * num_4x4_blocks_wide_lookup[bsize]; + const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT); + const uint8_t *pred = pred_buf; + const uint8_t *src = x->plane[0].src.buf; + const int src_stride = x->plane[0].src.stride; + const int pred_stride = 64; + unsigned int sse; + // Variance of whole block. + const unsigned int var = + cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); + const float factor = (var == 0) ? 
1.0f : (1.0f / (float)var); + + features[feature_idx++] = logf((float)var + 1.0f); + for (i = 0; i < 4; ++i) { + const int x_idx = (i & 1) * bs / 2; + const int y_idx = (i >> 1) * bs / 2; + const int src_offset = y_idx * src_stride + x_idx; + const int pred_offset = y_idx * pred_stride + x_idx; + // Variance of quarter block. + const unsigned int sub_var = + cpi->fn_ptr[subsize].vf(src + src_offset, src_stride, + pred + pred_offset, pred_stride, &sse); + const float var_ratio = (var == 0) ? 1.0f : factor * (float)sub_var; + features[feature_idx++] = var_ratio; + } + } + assert(feature_idx == FEATURES); + + // Feed the features into the model to get the confidence score. + nn_predict(features, nn_config, &score); + + // Higher score means that the model has higher confidence that the split + // partition is better than the non-split partition. So if the score is + // high enough, we skip the none-split partition search; if the score is + // low enough, we skip the split partition search. 
+ if (score > thresh_high) *none = 0; + if (score < thresh_low) *split = 0; + } +} +#undef FEATURES +#undef LABELS + int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int mi_col, int orig_rdmult) { TplDepFrame *tpl_frame = &cpi->tpl_stats[cpi->twopass.gf_group.index]; @@ -3624,6 +3758,21 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, pc_tree->partitioning = PARTITION_NONE; + if (cpi->sf.ml_var_partition_pruning) { + int do_ml_var_partition_pruning = + !frame_is_intra_only(cm) && partition_none_allowed && do_split && + mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && + mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; +#if CONFIG_VP9_HIGHBITDEPTH + if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + do_ml_var_partition_pruning = 0; +#endif // CONFIG_VP9_HIGHBITDEPTH + if (do_ml_var_partition_pruning) { + ml_predict_var_rd_paritioning(cpi, x, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); + } + } + // PARTITION_NONE if (partition_none_allowed) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, bsize, ctx, @@ -3738,6 +3887,9 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td, } } restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); + } else { + vp9_zero(ctx->pred_mv); + ctx->mic.interp_filter = EIGHTTAP; } // store estimated motion vector diff --git a/vp9/encoder/vp9_partition_models.h b/vp9/encoder/vp9_partition_models.h index 17c9013da..904d21400 100644 --- a/vp9/encoder/vp9_partition_models.h +++ b/vp9/encoder/vp9_partition_models.h @@ -18,7 +18,9 @@ extern "C" { #define NN_MAX_HIDDEN_LAYERS 10 #define NN_MAX_NODES_PER_LAYER 128 -// Neural net model config. +// Neural net model config. It defines the layout of a neural net model, such as +// the number of inputs/outputs, number of layers, the number of nodes in each +// layer, as well as the weights and bias of each node. typedef struct { int num_inputs; // Number of input nodes, i.e. features. 
int num_outputs; // Number of output nodes. @@ -964,6 +966,178 @@ static const NN_CONFIG vp9_var_part_nnconfig_16 = { #undef FEATURES #endif // CONFIG_ML_VAR_PARTITION +#define FEATURES 6 +#define LABELS 1 +static const float vp9_var_rd_part_nn_weights_64_layer0[FEATURES * 8] = { + -0.100129f, 0.128867f, -1.375086f, -2.268096f, -1.470368f, -2.296274f, + 0.034445f, -0.062993f, -2.151904f, 0.523215f, 1.611269f, 1.530051f, + 0.418182f, -1.330239f, 0.828388f, 0.386546f, -0.026188f, -0.055459f, + -0.474437f, 0.861295f, -2.208743f, -0.652991f, -2.985873f, -1.728956f, + 0.388052f, -0.420720f, 2.015495f, 1.280342f, 3.040914f, 1.760749f, + -0.009062f, 0.009623f, 1.579270f, -2.012891f, 1.629662f, -1.796016f, + -0.279782f, -0.288359f, 1.875618f, 1.639855f, 0.903020f, 0.906438f, + 0.553394f, -1.621589f, 0.185063f, 0.605207f, -0.133560f, 0.588689f, +}; + +static const float vp9_var_rd_part_nn_bias_64_layer0[8] = { + 0.659717f, 0.120912f, 0.329894f, -1.586385f, + 1.715839f, 0.085754f, 2.038774f, 0.268119f, +}; + +static const float vp9_var_rd_part_nn_weights_64_layer1[8 * LABELS] = { + -3.445586f, 2.375620f, 1.236970f, 0.804030f, + -2.448384f, 2.827254f, 2.291478f, 0.790252f, +}; + +static const float vp9_var_rd_part_nn_bias_64_layer1[LABELS] = { + -1.16608453f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_64 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_64_layer0, + vp9_var_rd_part_nn_weights_64_layer1, + }, + { + vp9_var_rd_part_nn_bias_64_layer0, + vp9_var_rd_part_nn_bias_64_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_32_layer0[FEATURES * 8] = { + 0.022420f, -0.032201f, 1.228065f, -2.767655f, 1.928743f, 0.566863f, + 0.459229f, 0.422048f, 0.833395f, 0.822960f, -0.232227f, 0.586895f, + 0.442856f, -0.018564f, 0.227672f, -1.291306f, 0.119428f, -0.776563f, + -0.042947f, 0.183129f, 0.592231f, 1.174859f, -0.503868f, 0.270102f, + -0.330537f, 
-0.036340f, 1.144630f, 1.783710f, 1.216929f, 2.038085f, + 0.373782f, -0.430258f, 1.957002f, 1.383908f, 2.012261f, 1.585693f, + -0.394399f, -0.337523f, -0.238335f, 0.007819f, -0.368294f, 0.437875f, + -0.318923f, -0.242000f, 2.276263f, 1.501432f, 0.645706f, 0.344774f, +}; + +static const float vp9_var_rd_part_nn_bias_32_layer0[8] = { + -0.023846f, -1.348117f, 1.365007f, -1.644164f, + 0.062992f, 1.257980f, -0.098642f, 1.388472f, +}; + +static const float vp9_var_rd_part_nn_weights_32_layer1[8 * LABELS] = { + 3.016729f, 0.622684f, -1.021302f, 1.490383f, + 1.702046f, -2.964618f, 0.689045f, 1.711754f, +}; + +static const float vp9_var_rd_part_nn_bias_32_layer1[LABELS] = { + -1.28798676f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_32 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_32_layer0, + vp9_var_rd_part_nn_weights_32_layer1, + }, + { + vp9_var_rd_part_nn_bias_32_layer0, + vp9_var_rd_part_nn_bias_32_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_16_layer0[FEATURES * 8] = { + -0.726813f, -0.026748f, 1.376946f, 1.467961f, 1.961810f, 1.690412f, + 0.596484f, -0.261486f, -0.310905f, -0.366311f, -1.300086f, -0.534336f, + 0.040520f, -0.032391f, -1.194214f, 2.438063f, -3.915334f, 1.997270f, + 0.673696f, -0.676393f, 1.654886f, 1.553838f, 1.129691f, 1.360201f, + 0.255001f, 0.336442f, -0.487759f, -0.634555f, 0.479170f, -0.110475f, + -0.661852f, -0.158872f, -0.350243f, -0.303957f, -0.045018f, 0.586151f, + -0.262463f, 0.228079f, -1.688776f, -1.594502f, -2.261078f, -1.802535f, + 0.034748f, -0.028476f, 2.713258f, 0.212446f, -1.529202f, -2.560178f, +}; + +static const float vp9_var_rd_part_nn_bias_16_layer0[8] = { + 0.495983f, 1.858545f, 0.162974f, 1.992247f, + -2.698863f, 0.110020f, 0.550830f, 0.420941f, +}; + +static const float vp9_var_rd_part_nn_weights_16_layer1[8 * LABELS] = { + 1.768409f, -1.394240f, 1.076846f, -1.762808f, + 1.517405f, 0.535195f, 
-0.426827f, 1.002272f, +}; + +static const float vp9_var_rd_part_nn_bias_16_layer1[LABELS] = { + -1.65894794f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_16 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_16_layer0, + vp9_var_rd_part_nn_weights_16_layer1, + }, + { + vp9_var_rd_part_nn_bias_16_layer0, + vp9_var_rd_part_nn_bias_16_layer1, + }, +}; + +static const float vp9_var_rd_part_nn_weights_8_layer0[FEATURES * 8] = { + -0.804900f, -1.214983f, 0.840202f, 0.686566f, 0.155804f, 0.025542f, + -1.244635f, -0.368403f, 0.364150f, 1.081073f, 0.552387f, 0.452715f, + 0.652968f, -0.293058f, 0.048967f, 0.021240f, -0.662981f, 0.424700f, + 0.008293f, -0.013088f, 0.747007f, -1.453907f, -1.498226f, 1.593252f, + -0.239557f, -0.143766f, 0.064311f, 1.320998f, -0.477411f, 0.026374f, + 0.730884f, -0.675124f, 0.965521f, 0.863658f, 0.809186f, 0.812280f, + 0.513131f, 0.185102f, 0.211354f, 0.793666f, 0.121714f, -0.015383f, + -0.650980f, -0.046581f, 0.911141f, 0.806319f, 0.974773f, 0.815893f, +}; + +static const float vp9_var_rd_part_nn_bias_8_layer0[8] = { + 0.176134f, 0.651308f, 2.007761f, 0.068812f, + 1.061517f, 1.487161f, -2.308147f, 1.099828f, +}; + +static const float vp9_var_rd_part_nn_weights_8_layer1[8 * LABELS] = { + 0.683032f, 1.326393f, -1.661539f, 1.438920f, + 1.118023f, -2.237380f, 1.518468f, 2.010416f, +}; + +static const float vp9_var_rd_part_nn_bias_8_layer1[LABELS] = { + -1.65423989f, +}; + +static const NN_CONFIG vp9_var_rd_part_nnconfig_8 = { + FEATURES, // num_inputs + LABELS, // num_outputs + 1, // num_hidden_layers + { + 8, + }, // num_hidden_nodes + { + vp9_var_rd_part_nn_weights_8_layer0, + vp9_var_rd_part_nn_weights_8_layer1, + }, + { + vp9_var_rd_part_nn_bias_8_layer0, + vp9_var_rd_part_nn_bias_8_layer1, + }, +}; +#undef FEATURES +#undef LABELS + // Partition pruning model(linear). 
static const float vp9_partition_feature_mean[24] = { 303501.697372f, 3042630.372158f, 24.694696f, 1.392182f, diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index edcc85c37..44909239d 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -219,6 +219,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->less_rectangular_check = 1; sf->use_square_partition_only = !boosted; sf->prune_ref_frame_for_rect_partitions = 1; + sf->ml_var_partition_pruning = 1; sf->ml_prune_rect_partition_threhold[0] = -1; sf->ml_prune_rect_partition_threhold[1] = 350; @@ -241,6 +242,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, if (speed >= 1) { sf->enable_tpl_model = 0; + sf->ml_var_partition_pruning = 0; sf->ml_prune_rect_partition_threhold[1] = 200; sf->ml_prune_rect_partition_threhold[2] = 200; sf->ml_prune_rect_partition_threhold[3] = 200; @@ -939,6 +941,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->ml_prune_rect_partition_threhold[1] = -1; sf->ml_prune_rect_partition_threhold[2] = -1; sf->ml_prune_rect_partition_threhold[3] = -1; + sf->ml_var_partition_pruning = 0; // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index acac39835..a895ed235 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -515,6 +515,10 @@ typedef struct SPEED_FEATURES { // Machine-learning based partition search early termination int ml_partition_search_early_termination; + // Machine-learning based partition search pruning using prediction residue + // variance. + int ml_var_partition_pruning; + // Allow skipping partition search for still image frame int allow_partition_search_skip; -- 2.40.0