From 7195ded2c5a8be2c343ce0372d6bc4d2c8389962 Mon Sep 17 00:00:00 2001
From: Marco Paniconi <marpan@google.com>
Date: Mon, 19 Nov 2018 14:13:48 -0800
Subject: [PATCH] vp9-svc: Reset temporal layers on scene change

Reuse existing function for resetting temporal
layer pattern.

And fix to use first spatial layer to encode, and
some refactoring in encode_without_recode_loop().

Change-Id: Ifb22bb9de793ecb8e73f410e125c7c12383da1d2
---
 vp9/encoder/vp9_encoder.c          | 77 +++++++++++++++++-------------
 vp9/encoder/vp9_ratectrl.c         |  8 ++--
 vp9/encoder/vp9_svc_layercontext.c |  7 +--
 vp9/encoder/vp9_svc_layercontext.h |  2 +-
 4 files changed, 54 insertions(+), 40 deletions(-)

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 976ba020d..7ced82186 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -3700,15 +3700,16 @@ static INLINE void set_raw_source_frame(VP9_COMP *cpi) {
 static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
                                       uint8_t *dest) {
   VP9_COMMON *const cm = &cpi->common;
+  SVC *const svc = &cpi->svc;
   int q = 0, bottom_index = 0, top_index = 0;
   int no_drop_scene_change = 0;
   const INTERP_FILTER filter_scaler =
       (is_one_pass_cbr_svc(cpi))
-          ? cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id]
+          ? svc->downsample_filter_type[svc->spatial_layer_id]
           : EIGHTTAP;
   const int phase_scaler =
       (is_one_pass_cbr_svc(cpi))
-          ? cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id]
+          ? svc->downsample_filter_phase[svc->spatial_layer_id]
           : 0;
 
   if (cm->show_existing_frame) {
@@ -3716,8 +3717,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
     return 1;
   }
 
-  cpi->svc.time_stamp_prev[cpi->svc.spatial_layer_id] =
-      cpi->svc.time_stamp_superframe;
+  svc->time_stamp_prev[svc->spatial_layer_id] = svc->time_stamp_superframe;
 
   // Flag to check if its valid to compute the source sad (used for
   // scene detection and for superblock content state in CBR mode).
@@ -3731,25 +3731,25 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   if (is_one_pass_cbr_svc(cpi) &&
       cpi->un_scaled_source->y_width == cm->width << 2 &&
       cpi->un_scaled_source->y_height == cm->height << 2 &&
-      cpi->svc.scaled_temp.y_width == cm->width << 1 &&
-      cpi->svc.scaled_temp.y_height == cm->height << 1) {
+      svc->scaled_temp.y_width == cm->width << 1 &&
+      svc->scaled_temp.y_height == cm->height << 1) {
     // For svc, if it is a 1/4x1/4 downscaling, do a two-stage scaling to take
     // advantage of the 1:2 optimized scaler. In the process, the 1/2x1/2
     // result will be saved in scaled_temp and might be used later.
-    const INTERP_FILTER filter_scaler2 = cpi->svc.downsample_filter_type[1];
-    const int phase_scaler2 = cpi->svc.downsample_filter_phase[1];
+    const INTERP_FILTER filter_scaler2 = svc->downsample_filter_type[1];
+    const int phase_scaler2 = svc->downsample_filter_phase[1];
     cpi->Source = vp9_svc_twostage_scale(
-        cm, cpi->un_scaled_source, &cpi->scaled_source, &cpi->svc.scaled_temp,
+        cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
         filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
-    cpi->svc.scaled_one_half = 1;
+    svc->scaled_one_half = 1;
   } else if (is_one_pass_cbr_svc(cpi) &&
              cpi->un_scaled_source->y_width == cm->width << 1 &&
              cpi->un_scaled_source->y_height == cm->height << 1 &&
-             cpi->svc.scaled_one_half) {
+             svc->scaled_one_half) {
     // If the spatial layer is 1/2x1/2 and the scaling is already done in the
     // two-stage scaling, use the result directly.
-    cpi->Source = &cpi->svc.scaled_temp;
-    cpi->svc.scaled_one_half = 0;
+    cpi->Source = &svc->scaled_temp;
+    svc->scaled_one_half = 0;
   } else {
     cpi->Source = vp9_scale_if_required(
         cm, cpi->un_scaled_source, &cpi->scaled_source, (cpi->oxcf.pass == 0),
@@ -3757,8 +3757,8 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   }
 #ifdef OUTPUT_YUV_SVC_SRC
   // Write out at most 3 spatial layers.
-  if (is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id < 3) {
-    vpx_write_yuv_frame(yuv_svc_src[cpi->svc.spatial_layer_id], cpi->Source);
+  if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) {
+    vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source);
   }
 #endif
   // Unfiltered raw source used in metrics calculation if the source
@@ -3778,9 +3778,9 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   }
 
   if ((cpi->use_svc &&
-       (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 ||
-        cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 ||
-        cpi->svc.current_superframe < 1)) ||
+       (svc->spatial_layer_id < svc->number_spatial_layers - 1 ||
+        svc->temporal_layer_id < svc->number_temporal_layers - 1 ||
+        svc->current_superframe < 1)) ||
       cpi->resize_pending || cpi->resize_state || cpi->external_resize ||
       cpi->resize_state != ORIG) {
     cpi->compute_source_sad_onepass = 0;
@@ -3829,20 +3829,31 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
        (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8)))
     vp9_scene_detection_onepass(cpi);
 
-  if (cpi->svc.spatial_layer_id == 0)
-    cpi->svc.high_source_sad_superframe = cpi->rc.high_source_sad;
+  if (svc->spatial_layer_id == svc->first_spatial_layer_to_encode) {
+    svc->high_source_sad_superframe = cpi->rc.high_source_sad;
+    // On scene change reset temporal layer pattern to TL0.
+    // TODO(marpan/jianj): Fix this to handle case where base
+    // spatial layers are skipped, in which case we should insert
+    // and reset to spatial layer 0 on scene change.
+    if (svc->high_source_sad_superframe && svc->temporal_layer_id > 0) {
+      // rc->high_source_sad will get reset so copy it to restore it.
+      int tmp_high_source_sad = cpi->rc.high_source_sad;
+      vp9_svc_reset_temporal_layers(cpi, cm->frame_type == KEY_FRAME);
+      cpi->rc.high_source_sad = tmp_high_source_sad;
+    }
+  }
 
   // For 1 pass CBR, check if we are dropping this frame.
   // Never drop on key frame, if base layer is key for svc,
   // on scene change, or if superframe has layer sync.
-  if ((cpi->rc.high_source_sad || cpi->svc.high_source_sad_superframe) &&
-      !(cpi->rc.use_post_encode_drop && cpi->svc.last_layer_dropped[0]))
+  if ((cpi->rc.high_source_sad || svc->high_source_sad_superframe) &&
+      !(cpi->rc.use_post_encode_drop && svc->last_layer_dropped[0]))
     no_drop_scene_change = 1;
   if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR &&
       !frame_is_intra_only(cm) && !no_drop_scene_change &&
-      !cpi->svc.superframe_has_layer_sync &&
+      !svc->superframe_has_layer_sync &&
       (!cpi->use_svc ||
-       !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)) {
+       !svc->layer_context[svc->temporal_layer_id].is_key_frame)) {
     if (vp9_rc_drop_frame(cpi)) return 0;
   }
 
@@ -3850,7 +3861,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   // when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can
   // avoid this frame-level upsampling (for non intra_only frames).
   if (frame_is_intra_only(cm) == 0 &&
-      !(is_one_pass_cbr_svc(cpi) && cpi->svc.force_zero_mode_spatial_ref)) {
+      !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref)) {
     vp9_scale_references(cpi);
   }
 
@@ -3860,12 +3871,12 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi);
 
   if (cpi->sf.svc_use_lowres_part &&
-      cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) {
-    if (cpi->svc.prev_partition_svc == NULL) {
+      svc->spatial_layer_id == svc->number_spatial_layers - 2) {
+    if (svc->prev_partition_svc == NULL) {
       CHECK_MEM_ERROR(
-          cm, cpi->svc.prev_partition_svc,
+          cm, svc->prev_partition_svc,
           (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows,
-                                   sizeof(*cpi->svc.prev_partition_svc)));
+                                   sizeof(*svc->prev_partition_svc)));
     }
   }
 
@@ -3893,13 +3904,13 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   if (cpi->use_svc) {
     // On non-zero spatial layer, check for disabling inter-layer
     // prediction.
-    if (cpi->svc.spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi);
+    if (svc->spatial_layer_id > 0) vp9_svc_constrain_inter_layer_pred(cpi);
     vp9_svc_assert_constraints_pattern(cpi);
   }
 
   if (cpi->rc.last_post_encode_dropped_scene_change) {
     cpi->rc.high_source_sad = 1;
-    cpi->svc.high_source_sad_superframe = 1;
+    svc->high_source_sad_superframe = 1;
     // For now disable use_source_sad since Last_Source will not be the previous
     // encoded but the dropped one.
     cpi->sf.use_source_sad = 0;
@@ -3910,7 +3921,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   // control parameters.
   if (cpi->sf.overshoot_detection_cbr_rt == FAST_DETECTION_MAXQ &&
       (cpi->rc.high_source_sad ||
-       (cpi->use_svc && cpi->svc.high_source_sad_superframe))) {
+       (cpi->use_svc && svc->high_source_sad_superframe))) {
     if (vp9_encodedframe_overshoot(cpi, -1, &q)) {
       vp9_set_quantizer(cm, q);
       vp9_set_variance_partition_thresholds(cpi, q, 0);
@@ -3945,7 +3956,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
   // For SVC: all spatial layers are checked for re-encoding.
   if (cpi->sf.overshoot_detection_cbr_rt == RE_ENCODE_MAXQ &&
       (cpi->rc.high_source_sad ||
-       (cpi->use_svc && cpi->svc.high_source_sad_superframe))) {
+       (cpi->use_svc && svc->high_source_sad_superframe))) {
     int frame_size = 0;
     // Get an estimate of the encoded frame size.
     save_coding_context(cpi);
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index bb316fe11..779529b24 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -2122,7 +2122,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
     cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_active = 0;
     if (is_one_pass_cbr_svc(cpi)) {
-      if (cm->current_video_frame > 0) vp9_svc_reset_key_frame(cpi);
+      if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1);
       layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
                                svc->number_temporal_layers);
       svc->layer_context[layer].is_key_frame = 1;
@@ -2750,8 +2750,10 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
 #endif
   rc->high_source_sad = 0;
   rc->high_num_blocks_with_motion = 0;
-  if (cpi->svc.spatial_layer_id == 0 && src_width == last_src_width &&
-      src_height == last_src_height) {
+  // For SVC: scene detection is only checked on first spatial layer of
+  // the superframe using the original/unscaled resolutions.
+  if (cpi->svc.spatial_layer_id == cpi->svc.first_spatial_layer_to_encode &&
+      src_width == last_src_width && src_height == last_src_height) {
     YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS] = { NULL };
     int num_mi_cols = cm->mi_cols;
     int num_mi_rows = cm->mi_rows;
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 7cab7ffb2..033684e2e 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -932,7 +932,7 @@ void vp9_free_svc_cyclic_refresh(VP9_COMP *const cpi) {
 }
 
 // Reset on key frame: reset counters, references and buffer updates.
-void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
+void vp9_svc_reset_temporal_layers(VP9_COMP *const cpi, int is_key) {
   int sl, tl;
   SVC *const svc = &cpi->svc;
   LAYER_CONTEXT *lc = NULL;
@@ -940,7 +940,7 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) {
     for (tl = 0; tl < svc->number_temporal_layers; ++tl) {
       lc = &cpi->svc.layer_context[sl * svc->number_temporal_layers + tl];
       lc->current_video_frame_in_layer = 0;
-      lc->frames_from_key_frame = 0;
+      if (is_key) lc->frames_from_key_frame = 0;
     }
   }
   if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) {
@@ -1108,7 +1108,8 @@ void vp9_svc_check_spatial_layer_sync(VP9_COMP *const cpi) {
     if (svc->spatial_layer_id == 0) {
       // On base spatial layer: if the current superframe has a layer sync then
       // reset the pattern counters and reset to base temporal layer.
-      if (svc->superframe_has_layer_sync) vp9_svc_reset_key_frame(cpi);
+      if (svc->superframe_has_layer_sync)
+        vp9_svc_reset_temporal_layers(cpi, cpi->common.frame_type == KEY_FRAME);
     }
     // If the layer sync is set for this current spatial layer then
     // disable the temporal reference.
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index a114a9188..17d7a8a50 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -237,7 +237,7 @@ int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
 
 void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
 
-void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
+void vp9_svc_reset_temporal_layers(struct VP9_COMP *const cpi, int is_key);
 
 void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi);
 
-- 
2.40.0