From: Yunqing Wang <yunqingwang@google.com>
Date: Mon, 27 Feb 2017 22:26:15 +0000 (-0800)
Subject: Apply machine learning-based early termination in VP9 partition search
X-Git-Tag: v1.7.0~636^2
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=670101439fe4a976fcacf997ff383b6cd6704596;p=libvpx

Apply machine learning-based early termination in VP9 partition search

This patch was based on Yang Xian's intern project code. Further modifications
were done.
1. Moved machine-learning related parameters into the context structure.
2. Corrected the calculation of sum_eobs.
3. Removed unused parameters and calculations.
4. Made it work with multiple tiles.
5. Added a speed feature for the machine-learning based partition search
early termination.
6. Re-organized the code.

The patch was rebased to the top-of-tree.

Borg test BDRATE result:
4k set:     PSNR: +0.144%; SSIM: +0.043%;
hdres set:  PSNR: +0.149%; SSIM: +0.269%;
midres set: PSNR: +0.127%; SSIM: +0.257%;

Average speed gain result:
4k clips: 22%;
hd clips: 23%;
midres clips: 15%.

Change-Id: I0220e93a8277e6a7ea4b2c34b605966e3b1584ac
---

diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h
index 86ba03d69..af1a93a00 100644
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -71,6 +71,9 @@ typedef struct {
   // search loop
   MV pred_mv[MAX_REF_FRAMES];
   INTERP_FILTER pred_interp_filter;
+
+  // Used for the machine learning-based early termination
+  int sum_eobs;
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 2d530a716..df72667d4 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -52,6 +52,33 @@ static void encode_superblock(VP9_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                               int output_enabled, int mi_row, int mi_col,
                               BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx);
 
+// Machine learning-based early termination parameters.
+static const double train_mean[24] = {
+  303501.697372, 3042630.372158, 24.694696, 1.392182,
+  689.413511,    162.027012,     1.478213,  0.0,
+  135382.260230, 912738.513263,  28.845217, 1.515230,
+  544.158492,    131.807995,     1.436863,  0.0,
+  43682.377587,  208131.711766,  28.084737, 1.356677,
+  138.254122,    119.522553,     1.252322,  0.0
+};
+
+static const double train_stdm[24] = {
+  673689.212982, 5996652.516628, 0.024449, 1.989792,
+  985.880847,    0.014638,       2.001898, 0.0,
+  208798.775332, 1812548.443284, 0.018693, 1.838009,
+  396.986910,    0.015657,       1.332541, 0.0,
+  55888.847031,  448587.962714,  0.017900, 1.904776,
+  98.652832,     0.016598,       1.320992, 0.0
+};
+
+// Error tolerance: 0.01%-0.0.05%-0.1%
+static const double classifiers[24] = {
+  0.111736, 0.289977, 0.042219, 0.204765, 0.120410, -0.143863,
+  0.282376, 0.847811, 0.637161, 0.131570, 0.018636, 0.202134,
+  0.112797, 0.028162, 0.182450, 1.124367, 0.386133, 0.083700,
+  0.050028, 0.150873, 0.061119, 0.109318, 0.127255, 0.625211
+};
+
 // This is used as a reference when computing the source variance for the
 //  purpose of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
@@ -2684,6 +2711,18 @@ static INLINE int get_motion_inconsistency(MOTION_DIRECTION this_mv,
 }
 #endif
 
+// Accumulate all tx blocks' eobs results got from the partition evaluation.
+static void accumulate_eobs(int plane, int block, int row, int col,
+                            BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                            void *arg) {
+  PICK_MODE_CONTEXT *ctx = (PICK_MODE_CONTEXT *)arg;
+  (void)row;
+  (void)col;
+  (void)plane_bsize;
+  (void)tx_size;
+  ctx->sum_eobs += ctx->eobs_pbuf[plane][1][block];
+}
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -2863,15 +2902,92 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
         best_rdc = this_rdc;
         if (bsize >= BLOCK_8X8) pc_tree->partitioning = PARTITION_NONE;
 
-        // If all y, u, v transform blocks in this partition are skippable, and
-        // the dist & rate are within the thresholds, the partition search is
-        // terminated for current branch of the partition search tree.
-        if (!x->e_mbd.lossless && ctx->skippable &&
-            ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
-             (best_rdc.dist < dist_breakout_thr &&
-              best_rdc.rate < rate_breakout_thr))) {
-          do_split = 0;
-          do_rect = 0;
+        if (!cpi->sf.ml_partition_search_early_termination) {
+          // If all y, u, v transform blocks in this partition are skippable,
+          // and the dist & rate are within the thresholds, the partition search
+          // is terminated for current branch of the partition search tree.
+          if (!x->e_mbd.lossless && ctx->skippable &&
+              ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+               (best_rdc.dist < dist_breakout_thr &&
+                best_rdc.rate < rate_breakout_thr))) {
+            do_split = 0;
+            do_rect = 0;
+          }
+        } else {
+          // Currently, the machine-learning based partition search early
+          // termination is only used while bsize is 16x16, 32x32 or 64x64,
+          // VPXMIN(cm->width, cm->height) >= 480, and speed = 0.
+          if (ctx->mic.mode >= INTRA_MODES && bsize >= BLOCK_16X16) {
+            const double *clf;
+            const double *mean;
+            const double *sd;
+            const int mag_mv =
+                abs(ctx->mic.mv[0].as_mv.col) + abs(ctx->mic.mv[0].as_mv.row);
+            const int left_in_image = !!xd->left_mi;
+            const int above_in_image = !!xd->above_mi;
+            MODE_INFO **prev_mi =
+                &cm->prev_mi_grid_visible[mi_col + cm->mi_stride * mi_row];
+            int above_par = 0;  // above_partitioning
+            int left_par = 0;   // left_partitioning
+            int last_par = 0;   // last_partitioning
+            BLOCK_SIZE context_size;
+            double score;
+            int offset = 0;
+
+            assert(b_width_log2_lookup[bsize] == b_height_log2_lookup[bsize]);
+
+            ctx->sum_eobs = 0;
+            vp9_foreach_transformed_block_in_plane(xd, bsize, 0,
+                                                   accumulate_eobs, ctx);
+
+            if (above_in_image) {
+              context_size = xd->above_mi->sb_type;
+              if (context_size < bsize)
+                above_par = 2;
+              else if (context_size == bsize)
+                above_par = 1;
+            }
+
+            if (left_in_image) {
+              context_size = xd->left_mi->sb_type;
+              if (context_size < bsize)
+                left_par = 2;
+              else if (context_size == bsize)
+                left_par = 1;
+            }
+
+            if (prev_mi) {
+              context_size = prev_mi[0]->sb_type;
+              if (context_size < bsize)
+                last_par = 2;
+              else if (context_size == bsize)
+                last_par = 1;
+            }
+
+            if (bsize == BLOCK_64X64)
+              offset = 0;
+            else if (bsize == BLOCK_32X32)
+              offset = 8;
+            else if (bsize == BLOCK_16X16)
+              offset = 16;
+
+            // early termination score calculation
+            clf = &classifiers[offset];
+            mean = &train_mean[offset];
+            sd = &train_stdm[offset];
+            score = clf[0] * (((double)ctx->rate - mean[0]) / sd[0]) +
+                    clf[1] * (((double)ctx->dist - mean[1]) / sd[1]) +
+                    clf[2] * (((double)mag_mv / 2 - mean[2]) * sd[2]) +
+                    clf[3] * (((double)(left_par + above_par) / 2 - mean[3]) *
+                              sd[3]) +
+                    clf[4] * (((double)ctx->sum_eobs - mean[4]) / sd[4]) +
+                    clf[5] * (((double)cm->base_qindex - mean[5]) * sd[5]) +
+                    clf[6] * (((double)last_par - mean[6]) * sd[6]) + clf[7];
+            if (score < 0) {
+              do_split = 0;
+              do_rect = 0;
+            }
+          }
         }
 
 #if CONFIG_FP_MB_STATS
@@ -2984,7 +3100,8 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
         pc_tree->partitioning = PARTITION_SPLIT;
 
         // Rate and distortion based partition search termination clause.
-        if (!x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+        if (!cpi->sf.ml_partition_search_early_termination &&
+            !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
                                    (best_rdc.dist < dist_breakout_thr &&
                                     best_rdc.rate < rate_breakout_thr))) {
           do_rect = 0;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index dfe132cbd..859d4eac0 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -71,7 +71,15 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
   sf->partition_search_breakout_thr.dist = (1 << 20);
   sf->partition_search_breakout_thr.rate = 80;
 
+  // Currently, the machine-learning based partition search early termination
+  // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0.
+  if (VPXMIN(cm->width, cm->height) >= 480) {
+    sf->ml_partition_search_early_termination = 1;
+  }
+
   if (speed >= 1) {
+    sf->ml_partition_search_early_termination = 0;
+
     if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask =
           cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
@@ -586,6 +594,7 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) {
   // Some speed-up features even for best quality as minimal impact on quality.
   sf->partition_search_breakout_thr.dist = (1 << 19);
   sf->partition_search_breakout_thr.rate = 80;
+  sf->ml_partition_search_early_termination = 0;
 
   if (oxcf->mode == REALTIME) {
     set_rt_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index b04cd872c..96898eea3 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -449,6 +449,9 @@ typedef struct SPEED_FEATURES {
   // Partition search early breakout thresholds.
   PARTITION_SEARCH_BREAKOUT_THR partition_search_breakout_thr;
 
+  // Machine-learning based partition search early termination
+  int ml_partition_search_early_termination;
+
   // Allow skipping partition search for still image frame
   int allow_partition_search_skip;