From c4978abc07d04884ec5aeb204e946a9d170bee92 Mon Sep 17 00:00:00 2001
From: sdeng <sdeng@google.com>
Date: Wed, 24 Oct 2018 16:23:24 -0700
Subject: [PATCH] Enable 10 bit tpl support

         lowres_bd10   midres_bd10
avg_psnr      -0.897        -1.261
ovr_psnr      -0.975        -1.349

Change-Id: Id54f2c419f4edaa91e89ffea52b4038b1d94e563
---
 vp9/encoder/vp9_encoder.c        | 51 ++++++++++++++++++++++++++++----
 vp9/encoder/vp9_speed_features.c |  5 ----
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index c189fbd2c..56a423e6c 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -5773,9 +5773,21 @@ void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
   int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
   const int shift = tx_size == TX_32X32 ? 0 : 2;
 
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    vp9_highbd_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp,
+                                 p->quant_fp, qcoeff, dqcoeff, pd->dequant,
+                                 &eob, scan_order->scan, scan_order->iscan);
+  } else {
+    vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp,
+                          p->quant_fp, qcoeff, dqcoeff, pd->dequant, &eob,
+                          scan_order->scan, scan_order->iscan);
+  }
+#else
   vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, p->quant_fp,
                         qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan,
                         scan_order->iscan);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
   *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
   *recon_error = VPXMAX(*recon_error, 1);
@@ -5784,6 +5796,19 @@ void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
   *sse = VPXMAX(*sse, 1);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
+                         TX_SIZE tx_size) {
+  // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
+  switch (tx_size) {  // Select the _c Hadamard kernel matching tx_size.
+    case TX_8X8: vpx_hadamard_8x8_c(src_diff, bw, coeff); break;
+    case TX_16X16: vpx_hadamard_16x16_c(src_diff, bw, coeff); break;
+    case TX_32X32: vpx_hadamard_32x32_c(src_diff, bw, coeff); break;
+    default: assert(0);  // Only 8x8, 16x16 and 32x32 are handled here.
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
                   TX_SIZE tx_size) {
   switch (tx_size) {
@@ -5883,11 +5908,24 @@ void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
     vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
                             src_stride, dst, dst_stride, 0, 0, 0);
 
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                                dst_stride, xd->bd);
+      highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      // TODO(sdeng): Implement SIMD based high bit-depth satd.
+      intra_cost = vpx_satd_c(coeff, pix_num);
+    } else {
+      vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
+                         dst_stride);
+      wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      intra_cost = vpx_satd(coeff, pix_num);
+    }
+#else
     vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
-
     wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-
     intra_cost = vpx_satd(coeff, pix_num);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
   }
@@ -5911,8 +5949,6 @@ void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
         mi_row, mi_col, &mv.as_mv);
 #endif
 
-    // TODO(jingning): Not yet support high bit-depth in the next three
-    // steps.
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       vp9_highbd_build_inter_predictor(
@@ -5923,6 +5959,8 @@ void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
       vpx_highbd_subtract_block(
           bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
           xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
+      highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      inter_cost = vpx_satd_c(coeff, pix_num);
     } else {
       vp9_build_inter_predictor(
           ref_frame[rf_idx]->y_buffer + mb_y_offset,
@@ -5931,6 +5969,8 @@ void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
       vpx_subtract_block(bh, bw, src_diff, bw,
                          xd->cur_buf->y_buffer + mb_y_offset,
                          xd->cur_buf->y_stride, &predictor[0], bw);
+      wht_fwd_txfm(src_diff, bw, coeff, tx_size);
+      inter_cost = vpx_satd(coeff, pix_num);
     }
 #else
     vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
@@ -5940,10 +5980,9 @@ void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
     vpx_subtract_block(bh, bw, src_diff, bw,
                        xd->cur_buf->y_buffer + mb_y_offset,
                        xd->cur_buf->y_stride, &predictor[0], bw);
-#endif
     wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-
     inter_cost = vpx_satd(coeff, pix_num);
+#endif
 
 #if CONFIG_NON_GREEDY_MV
     tpl_stats->inter_cost_arr[rf_idx] = inter_cost;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index db064d3df..1f9044265 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -902,12 +902,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
   sf->allow_quant_coeff_opt = sf->optimize_coefficients;
   sf->quant_opt_thresh = 99.0;
   sf->allow_acl = 1;
-#if CONFIG_VP9_HIGHBITDEPTH
-  // TODO(jingning): Make the model support high bit-depth route.
-  sf->enable_tpl_model = !cm->use_highbitdepth && oxcf->enable_tpl_model;
-#else
   sf->enable_tpl_model = oxcf->enable_tpl_model;
-#endif
   sf->prune_ref_frame_for_rect_partitions = 0;
 
   for (i = 0; i < TX_SIZES; i++) {
-- 
2.40.0