From 97848890a99af6604df13ec38c094a01aea40f0e Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Wed, 8 Aug 2018 14:01:26 -0700
Subject: [PATCH] vp9: Allow for overshoot detection for non-screen CBR mode.

For CBR real-time mode: refactor usage of speed feature to
handle overshoot on slide/scene change. Add 2 modes to indicate
how slide/scene change is processed for re-setting Q/rate control.
Keep the speed setting to 1 for speed >= 5, otherwise set to 0.

Video content and screen content are now handled in similar way,
though with different thresholds.

Some fixes to thresholds and reset: correct the reset of the buffer
level to optimal level for each temporal layer, if scene change
frame will be encoded at max_q.

Also increase the min_thresh for video mode (non-screen content):
this is to avoid scene change detection on cases like large
lighting changes, cameras focus. And increase in min_thresh
makes it more robust to sudden increase in noise level.

Change-Id: I256d350da6e92d2ddc09f100fc06ac147cbc1e49
---
 vp9/encoder/vp9_encoder.c        | 16 ++++++++--------
 vp9/encoder/vp9_ratectrl.c       | 26 +++++++++++++-------------
 vp9/encoder/vp9_speed_features.c | 11 ++++-------
 vp9/encoder/vp9_speed_features.h | 14 +++++++++++---
 4 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 3db11fcb0..335a0b292 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -4052,13 +4052,12 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
     vp9_svc_assert_constraints_pattern(cpi);
   }
 
-  if (!cpi->sf.re_encode_overshoot_rt &&
-      cpi->oxcf.content == VP9E_CONTENT_SCREEN &&
+  // Check if this high_source_sad (scene/slide change) frame should be
+  // encoded at high/max QP, and if so, set the q and adjust some rate
+  // control parameters.
+  if (cpi->sf.overshoot_detection_rt == 1 &&
       (cpi->rc.high_source_sad ||
        (cpi->use_svc && cpi->svc.high_source_sad_superframe))) {
-    // Check if this high_source_sad (scene/slide change) frame should be
-    // encoded at high/max QP, and if so, set the q and adjust some rate
-    // control parameters.
     if (vp9_encodedframe_overshoot(cpi, -1, &q)) {
       vp9_set_quantizer(cm, q);
       vp9_set_variance_partition_thresholds(cpi, q, 0);
@@ -4087,10 +4086,11 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
 
   vp9_encode_frame(cpi);
 
-  // Check if we should drop this frame because of high overshoot.
-  // Only for frames where high temporal-source SAD is detected.
+  // Check if we should re-encode this frame at high Q because of high
+  // overshoot based on the encoded frame size. Only for frames where
+  // high temporal-source SAD is detected.
   // For SVC: all spatial layers are checked for re-encoding.
-  if (cpi->sf.re_encode_overshoot_rt &&
+  if (cpi->sf.overshoot_detection_rt == 2 &&
       (cpi->rc.high_source_sad ||
        (cpi->use_svc && cpi->svc.high_source_sad_superframe))) {
     int frame_size = 0;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index e0c877833..64db18f09 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -2648,10 +2648,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
     float thresh = 8.0f;
     uint32_t thresh_key = 140000;
     if (cpi->oxcf.speed <= 5) thresh_key = 240000;
-    if (cpi->oxcf.rc_mode == VPX_VBR) {
-      min_thresh = 65000;
-      thresh = 2.1f;
-    }
+    if (cpi->oxcf.content != VP9E_CONTENT_SCREEN) min_thresh = 65000;
+    if (cpi->oxcf.rc_mode == VPX_VBR) thresh = 2.1f;
     if (cpi->use_svc && cpi->svc.number_spatial_layers > 1) {
       const int aligned_width = ALIGN_POWER_OF_TWO(src_width, MI_SIZE_LOG2);
       const int aligned_height = ALIGN_POWER_OF_TWO(src_height, MI_SIZE_LOG2);
@@ -2822,14 +2820,16 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
   SPEED_FEATURES *const sf = &cpi->sf;
   int thresh_qp = 7 * (rc->worst_quality >> 3);
   int thresh_rate = rc->avg_frame_bandwidth << 3;
-  // Lower rate threshold for video.
+  // Lower thresh_qp for video (more overshoot at lower Q) to be
+  // more conservative for video.
   if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
-    thresh_rate = rc->avg_frame_bandwidth << 2;
+    thresh_qp = rc->worst_quality >> 1;
   // If this decision is not based on an encoded frame size but just on
-  // scene/slide change detection (i.e., re_encode_overshoot_rt = 0), adjust the
-  // qp_thresh and skip the (frame_size > thresh_rate) condition in this case.
-  if (!sf->re_encode_overshoot_rt) thresh_qp = 3 * (rc->worst_quality >> 2);
-  if ((!sf->re_encode_overshoot_rt || frame_size > thresh_rate) &&
+  // scene/slide change detection (i.e., re_encode_overshoot_rt = 1),
+  // for now skip the (frame_size > thresh_rate) condition in this case.
+  // TODO(marpan): Use a better size/rate condition for this case and
+  // adjust thresholds.
+  if ((sf->overshoot_detection_rt == 1 || frame_size > thresh_rate) &&
       cm->base_qindex < thresh_qp) {
     double rate_correction_factor =
         cpi->rc.rate_correction_factors[INTER_NORMAL];
@@ -2846,7 +2846,7 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
     // and the encoded frame used alot of Intra modes, then force hybrid_intra
     // encoding for the re-encode on this scene change. hybrid_intra will
     // use rd-based intra mode selection for small blocks.
-    if (sf->re_encode_overshoot_rt && frame_size > (thresh_rate << 1) &&
+    if (sf->overshoot_detection_rt == 2 && frame_size > (thresh_rate << 1) &&
         cpi->svc.spatial_layer_id == 0) {
       MODE_INFO **mi = cm->mi_grid_visible;
       int sum_intra_usage = 0;
@@ -2900,8 +2900,8 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) {
         LAYER_CONTEXT *lc = &svc->layer_context[layer];
         RATE_CONTROL *lrc = &lc->rc;
         lrc->avg_frame_qindex[INTER_FRAME] = *q;
-        lrc->buffer_level = rc->optimal_buffer_level;
-        lrc->bits_off_target = rc->optimal_buffer_level;
+        lrc->buffer_level = lrc->optimal_buffer_level;
+        lrc->bits_off_target = lrc->optimal_buffer_level;
         lrc->rc_1_frame = 0;
         lrc->rc_2_frame = 0;
         lrc->rate_correction_factors[INTER_NORMAL] = rate_correction_factor;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 859e626bb..107da2194 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -405,7 +405,7 @@ static void set_rt_speed_feature_framesize_independent(
   sf->use_compound_nonrd_pickmode = 0;
   sf->nonrd_keyframe = 0;
   sf->svc_use_lowres_part = 0;
-  sf->re_encode_overshoot_rt = 0;
+  sf->overshoot_detection_rt = 0;
   sf->disable_16x16part_nonkey = 0;
   sf->disable_golden_ref = 0;
   sf->enable_tpl_model = 0;
@@ -570,11 +570,9 @@ static void set_rt_speed_feature_framesize_independent(
     // Keep nonrd_keyframe = 1 for non-base spatial layers to prevent
     // increase in encoding time.
     if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) sf->nonrd_keyframe = 1;
-    if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
-        cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
-        (cpi->use_svc || cpi->oxcf.content == VP9E_CONTENT_SCREEN)) {
-      sf->re_encode_overshoot_rt = 1;
-    }
+    if (cm->frame_type != KEY_FRAME && cpi->resize_state == ORIG &&
+        cpi->oxcf.rc_mode == VPX_CBR)
+      sf->overshoot_detection_rt = 1;
     if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 &&
         cm->width <= 1280 && cm->height <= 720) {
       sf->use_altref_onepass = 1;
@@ -583,7 +581,6 @@ static void set_rt_speed_feature_framesize_independent(
   }
 
   if (speed >= 6) {
-    sf->re_encode_overshoot_rt = 0;
     if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) {
       sf->use_altref_onepass = 1;
       sf->use_compound_nonrd_pickmode = 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 89fc82ebf..7430f0a33 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -542,9 +542,17 @@ typedef struct SPEED_FEATURES {
   // For SVC: enables use of partition from lower spatial resolution.
   int svc_use_lowres_part;
 
-  // Enable re-encoding on scene change with potential high overshoot,
-  // for real-time encoding flow.
-  int re_encode_overshoot_rt;
+  // Flag to indicate process for handling overshoot on slide/scene change,
+  // for real-time CBR mode.
+  // 0: no reaction to rate control on a detected slide/scene change
+  // (prior to encoding the frame).
+  // 1: set to larger Q based only on the detected slide/scene change
+  // and current/past Q. No second pass encoding, so faster than option 2.
+  // 2: based on (first pass) encoded frame, if large frame size is detected
+  // then set to higher Q for second encode. This involves 2 pass encoding
+  // on slide change, so slower than 1, but more accurate for detecting
+  // overshoot.
+  int overshoot_detection_rt;
 
   // Disable partitioning of 16x16 blocks.
   int disable_16x16part_nonkey;
-- 
2.40.0