From: Debargha Mukherjee <debargha@google.com>
Date: Tue, 19 Jan 2016 21:01:01 +0000 (-0800)
Subject: Loop restoration filter
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=84ca7a9f0f6ad793c16428d18fa6c5d5a8ae0f37;p=libvpx

Loop restoration filter

Current implementation is a bilateral filter whose
parameters are transmitted in the bitstream.

derflr: -0.647% BDRATE
hevcmr: -0.794% BDRATE

This is a preliminary patch. Other variations, which will
hopefully be less expensive on the decoder side, are to be
investigated next.

Change-Id: I50634ae8f5014ad0bf7432306348908a349d81e1
---

diff --git a/vp10/common/alloccommon.c b/vp10/common/alloccommon.c
index 364afde47..e14aee76f 100644
--- a/vp10/common/alloccommon.c
+++ b/vp10/common/alloccommon.c
@@ -81,6 +81,12 @@ void vp10_free_ref_frame_buffers(BufferPool *pool) {
   }
 }
 
+#if CONFIG_LOOP_RESTORATION
+void vp10_free_restoration_buffers(VP10_COMMON *cm) {
+  vpx_free_frame_buffer(&cm->tmp_loop_buf);  // scratch frame of the filter
+}
+#endif  // CONFIG_LOOP_RESTORATION
+
 void vp10_free_postproc_buffers(VP10_COMMON *cm) {
 #if CONFIG_VP9_POSTPROC
   vpx_free_frame_buffer(&cm->post_proc_buffer);
diff --git a/vp10/common/alloccommon.h b/vp10/common/alloccommon.h
index 5cfe6602d..f77833b7d 100644
--- a/vp10/common/alloccommon.h
+++ b/vp10/common/alloccommon.h
@@ -29,6 +29,9 @@ void vp10_free_context_buffers(struct VP10Common *cm);
 
 void vp10_free_ref_frame_buffers(struct BufferPool *pool);
 void vp10_free_postproc_buffers(struct VP10Common *cm);
+#if CONFIG_LOOP_RESTORATION
+void vp10_free_restoration_buffers(struct VP10Common *cm);
+#endif  // CONFIG_LOOP_RESTORATION
 
 int vp10_alloc_state_buffers(struct VP10Common *cm, int width, int height);
 void vp10_free_state_buffers(struct VP10Common *cm);
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 92f00c485..3e1068065 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -1498,6 +1498,10 @@ void vp10_setup_past_independence(VP10_COMMON *cm) {
 
   // To force update of the sharpness
   lf->last_sharpness_level = -1;
+#if CONFIG_LOOP_RESTORATION
+  lf->bilateral_level = 0;
+  lf->last_bilateral_level = 0;
+#endif  // CONFIG_LOOP_RESTORATION
 
   vp10_default_coef_probs(cm);
   init_mode_probs(cm->fc);
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 875030d0a..380312e6c 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
+
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vp10/common/loopfilter.h"
@@ -212,6 +214,250 @@ static const int mode_lf_lut[MB_MODE_COUNT] = {
 #endif  // CONFIG_EXT_INTER
 };
 
+#if CONFIG_LOOP_RESTORATION
+#define BILATERAL_RANGE  256
+#define BILATERAL_RANGE_SYM  (2 * BILATERAL_RANGE + 1)
+static double bilateral_filters_r_kf[BILATERAL_LEVELS_KF + 1]
+                                    [BILATERAL_RANGE_SYM];
+static double bilateral_filters_r[BILATERAL_LEVELS + 1][BILATERAL_RANGE_SYM];
+static double bilateral_filters_s_kf[BILATERAL_LEVELS_KF + 1]
+                                    [BILATERAL_WIN][BILATERAL_WIN];
+static double bilateral_filters_s[BILATERAL_LEVELS + 1]
+                                 [BILATERAL_WIN][BILATERAL_WIN];
+
+// Fills the weight tables of one bilateral level from its sigmas, which are
+// stored in units of 1 / BILATERAL_PRECISION:
+//   fr - range kernel, BILATERAL_RANGE_SYM entries; the entry for a pixel
+//        difference d lives at fr[d + BILATERAL_RANGE].
+//   fs - spatial kernel, BILATERAL_WIN rows of BILATERAL_WIN weights.
+// Both are unnormalised Gaussians; the filter divides by the weight sum.
+static void bilateral_precal_level(const bilateral_params_t param,
+                                   double *fr,
+                                   double (*fs)[BILATERAL_WIN]) {
+  const double sigma_r_d = (double)param.sigma_r / BILATERAL_PRECISION;
+  const double sigma_x_d = (double)param.sigma_x / BILATERAL_PRECISION;
+  const double sigma_y_d = (double)param.sigma_y / BILATERAL_PRECISION;
+  double *const fr_mid = fr + BILATERAL_RANGE;  // weight for difference 0
+  int j, x, y;
+
+  // The range kernel is even, so fill both signs from one evaluation.
+  for (j = 0; j <= BILATERAL_RANGE; ++j) {
+    fr_mid[j] = exp(-(j * j) / (2 * sigma_r_d * sigma_r_d));
+    fr_mid[-j] = fr_mid[j];
+  }
+  for (y = -BILATERAL_HALFWIN; y <= BILATERAL_HALFWIN; ++y) {
+    for (x = -BILATERAL_HALFWIN; x <= BILATERAL_HALFWIN; ++x) {
+      fs[y + BILATERAL_HALFWIN][x + BILATERAL_HALFWIN] =
+          exp(-(x * x) / (2 * sigma_x_d * sigma_x_d)
+              -(y * y) / (2 * sigma_y_d * sigma_y_d));
+    }
+  }
+}
+
+// Builds the range and spatial weight tables for every bilateral level of
+// both the key-frame and the non-key-frame parameter sets. Level 0 has all
+// sigmas equal to zero (filter off) and is skipped, so its tables stay
+// zero-initialised.
+void vp10_loop_bilateral_precal(void) {
+  int i;
+
+  // Key-frame levels.
+  for (i = 1; i <= BILATERAL_LEVELS_KF; ++i) {
+    bilateral_precal_level(vp10_bilateral_level_to_params(i, 1),
+                           bilateral_filters_r_kf[i],
+                           bilateral_filters_s_kf[i]);
+  }
+
+  // Levels for all other frames.
+  for (i = 1; i <= BILATERAL_LEVELS; ++i) {
+    bilateral_precal_level(vp10_bilateral_level_to_params(i, 0),
+                           bilateral_filters_r[i],
+                           bilateral_filters_s[i]);
+  }
+}
+
+int vp10_bilateral_level_bits(const VP10_COMMON *const cm) {
+  const int kf = cm->frame_type == KEY_FRAME;  // key frames use a wider field
+  return kf ? BILATERAL_LEVEL_BITS_KF : BILATERAL_LEVEL_BITS;
+}
+
+int vp10_loop_bilateral_used(int level, int kf) {
+  const bilateral_params_t p = vp10_bilateral_level_to_params(level, kf);
+  return p.sigma_x != 0 && p.sigma_y != 0 && p.sigma_r != 0;  // all non-zero
+}
+
+void vp10_loop_bilateral_init(loop_filter_info_n *lfi, int level, int kf) {
+  int row;
+  lfi->bilateral_used = vp10_loop_bilateral_used(level, kf);
+  if (!lfi->bilateral_used) return;  // LUT pointers keep their old values
+  // Point at the precomputed tables; key frames have their own set.
+  lfi->wr_lut = kf ? bilateral_filters_r_kf[level]
+                   : bilateral_filters_r[level];
+  for (row = 0; row < BILATERAL_WIN; ++row) {
+    lfi->wx_lut[row] = kf ? bilateral_filters_s_kf[level][row]
+                          : bilateral_filters_s[level][row];
+  }
+}
+
+static int is_in_image(int x, int y, int width, int height) {
+  return !(x < 0 || x >= width || y < 0 || y >= height);  // 1 iff inside
+}
+
+void loop_bilateral_filter(uint8_t *data, int width, int height,
+                           int stride, loop_filter_info_n *lfi,
+                           uint8_t *tmpdata, int tmpstride) {
+  // Centre of the range LUT, i.e. the weight for a zero pixel difference.
+  const double *const wr_mid = lfi->wr_lut + BILATERAL_RANGE;
+  int r, c;
+
+  // Pass 1: filter |data| into |tmpdata| so every tap reads unfiltered pixels.
+  for (r = 0; r < height; ++r) {
+    const uint8_t *const src_row = data + r * stride;
+    uint8_t *const dst_row = tmpdata + r * tmpstride;
+    for (c = 0; c < width; ++c) {
+      const int centre = src_row[c];
+      double num = 0, den = 0;
+      int dx, dy;
+      for (dy = -BILATERAL_HALFWIN; dy <= BILATERAL_HALFWIN; ++dy) {
+        const double *const ws_row = lfi->wx_lut[dy + BILATERAL_HALFWIN];
+        const uint8_t *const tap_row = src_row + c + dy * stride;
+        for (dx = -BILATERAL_HALFWIN; dx <= BILATERAL_HALFWIN; ++dx) {
+          double wt;
+          // Taps outside the width x height region are simply left out.
+          if (!is_in_image(c + dx, r + dy, width, height)) continue;
+          wt = ws_row[dx + BILATERAL_HALFWIN] * wr_mid[tap_row[dx] - centre];
+          den += wt;
+          num += wt * tap_row[dx];
+        }
+      }
+      assert(den > 0);
+      dst_row[c] = clip_pixel((int)(num / den + 0.5));
+    }
+  }
+
+  // Pass 2: copy the filtered rows back over the source plane.
+  for (r = 0; r < height; ++r)
+    memcpy(data + r * stride, tmpdata + r * tmpstride, width * sizeof(*data));
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void loop_bilateral_filter_highbd(uint8_t *data8, int width, int height,
+                                  int stride, loop_filter_info_n *lfi,
+                                  uint8_t *tmpdata8, int tmpstride,
+                                  int bit_depth) {
+  // Centre of the range LUT, i.e. the weight for a zero pixel difference.
+  const double *const wr_mid = lfi->wr_lut + BILATERAL_RANGE;
+  // Differences are scaled down to 8-bit precision before the LUT lookup.
+  const int diff_shift = bit_depth - 8;
+  uint16_t *const data = CONVERT_TO_SHORTPTR(data8);
+  uint16_t *const tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
+  int r, c;
+
+  // Pass 1: filter |data| into |tmpdata| so every tap reads unfiltered pixels.
+  for (r = 0; r < height; ++r) {
+    const uint16_t *const src_row = data + r * stride;
+    uint16_t *const dst_row = tmpdata + r * tmpstride;
+    for (c = 0; c < width; ++c) {
+      double num = 0, den = 0;
+      int dx, dy;
+      for (dy = -BILATERAL_HALFWIN; dy <= BILATERAL_HALFWIN; ++dy) {
+        const double *const ws_row = lfi->wx_lut[dy + BILATERAL_HALFWIN];
+        const uint16_t *const tap_row = src_row + c + dy * stride;
+        for (dx = -BILATERAL_HALFWIN; dx <= BILATERAL_HALFWIN; ++dx) {
+          double wt;
+          int diff_r;
+          // Taps outside the width x height region are simply left out.
+          if (!is_in_image(c + dx, r + dy, width, height)) continue;
+
+          // NOTE(review): right-shifts a possibly negative difference.
+          diff_r = (tap_row[dx] - src_row[c]) >> diff_shift;
+          assert(diff_r >= -256 && diff_r <= 256);
+          wt = ws_row[dx + BILATERAL_HALFWIN] * wr_mid[diff_r];
+          den += wt;
+          num += wt * tap_row[dx];
+        }
+      }
+      assert(den > 0);
+      dst_row[c] = (int)(num / den + 0.5);
+    }
+  }
+
+  // Pass 2: copy the filtered rows back over the source plane.
+  for (r = 0; r < height; ++r)
+    memcpy(data + r * stride, tmpdata + r * tmpstride, width * sizeof(*data));
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_loop_bilateral_rows(YV12_BUFFER_CONFIG *frame,
+                              VP10_COMMON *cm,
+                              int start_mi_row, int end_mi_row,
+                              int y_only) {  // filters mi rows [start, end)
+  const int ywidth = frame->y_crop_width;
+  const int ystride = frame->y_stride;
+  const int uvwidth = frame->uv_crop_width;
+  const int uvstride = frame->uv_stride;
+  const int ystart = start_mi_row << MI_SIZE_LOG2;  // mi units -> luma rows
+  const int uvstart = ystart >> cm->subsampling_y;  // matching chroma row
+  int yend = end_mi_row << MI_SIZE_LOG2;
+  int uvend = yend >> cm->subsampling_y;
+  YV12_BUFFER_CONFIG *tmp_buf;
+  yend = VPXMIN(yend, cm->height);  // never filter past the frame height
+  uvend = VPXMIN(uvend, cm->subsampling_y ? (cm->height + 1) >> 1 : cm->height);
+
+  if (vpx_realloc_frame_buffer(&cm->tmp_loop_buf, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) < 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate tmp restoration buffer");
+
+  tmp_buf = &cm->tmp_loop_buf;  // scratch frame receiving the filter output
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    loop_bilateral_filter_highbd(frame->y_buffer + ystart * ystride,
+                                 ywidth, yend - ystart, ystride, &cm->lf_info,
+                                 tmp_buf->y_buffer + ystart * tmp_buf->y_stride,
+                                 tmp_buf->y_stride, cm->bit_depth);
+  else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  loop_bilateral_filter(frame->y_buffer + ystart * ystride,
+                        ywidth, yend - ystart, ystride, &cm->lf_info,
+                        tmp_buf->y_buffer + ystart * tmp_buf->y_stride,
+                        tmp_buf->y_stride);
+  if (!y_only) {  // filter both chroma planes as well
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    loop_bilateral_filter_highbd(
+        frame->u_buffer + uvstart * uvstride,
+        uvwidth, uvend - uvstart, uvstride, &cm->lf_info,
+        tmp_buf->u_buffer + uvstart * tmp_buf->uv_stride,
+        tmp_buf->uv_stride, cm->bit_depth);
+    loop_bilateral_filter_highbd(
+        frame->v_buffer + uvstart * uvstride,
+        uvwidth, uvend - uvstart, uvstride, &cm->lf_info,
+        tmp_buf->v_buffer + uvstart * tmp_buf->uv_stride,
+        tmp_buf->uv_stride, cm->bit_depth);
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    loop_bilateral_filter(frame->u_buffer + uvstart * uvstride,
+                          uvwidth, uvend - uvstart, uvstride, &cm->lf_info,
+                          tmp_buf->u_buffer + uvstart * tmp_buf->uv_stride,
+                          tmp_buf->uv_stride);
+    loop_bilateral_filter(frame->v_buffer + uvstart * uvstride,
+                          uvwidth, uvend - uvstart, uvstride, &cm->lf_info,
+                          tmp_buf->v_buffer + uvstart * tmp_buf->uv_stride,
+                          tmp_buf->uv_stride);
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+}
+#endif  // CONFIG_LOOP_RESTORATION
+
 static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
   int lvl;
 
@@ -252,6 +498,10 @@ void vp10_loop_filter_init(VP10_COMMON *cm) {
   // init hev threshold const vectors
   for (lvl = 0; lvl <= MAX_LOOP_FILTER; lvl++)
     memset(lfi->lfthr[lvl].hev_thr, (lvl >> 4), SIMD_WIDTH);
+
+#if CONFIG_LOOP_RESTORATION
+  vp10_loop_bilateral_precal();
+#endif  // CONFIG_LOOP_RESTORATION
 }
 
 void vp10_loop_filter_frame_init(VP10_COMMON *cm, int default_filt_lvl) {
@@ -1721,6 +1971,30 @@ void vp10_loop_filter_data_reset(
   memcpy(lf_data->planes, planes, sizeof(lf_data->planes));
 }
 
+#if CONFIG_LOOP_RESTORATION
+void vp10_loop_bilateral_frame(YV12_BUFFER_CONFIG *frame,
+                               VP10_COMMON *cm,
+                               int bilateral_level,
+                               int y_only, int partial_frame) {
+  int first_mi_row = 0;
+  int num_mi_rows = cm->mi_rows;
+
+  // Load the LUTs for this level. A level with a zero sigma switches the
+  // filter off, in which case the frame is left untouched.
+  vp10_loop_bilateral_init(&cm->lf_info, bilateral_level,
+                           cm->frame_type == KEY_FRAME);
+  if (!cm->lf_info.bilateral_used) return;
+  if (partial_frame && cm->mi_rows > 8) {
+    // Only filter a band that starts at half height, rounded down to a
+    // multiple of 8 mi rows, and spans max(mi_rows / 8, 8) mi rows.
+    first_mi_row = (cm->mi_rows >> 1) & 0xfffffff8;
+    num_mi_rows = VPXMAX(cm->mi_rows / 8, 8);
+  }
+  vp10_loop_bilateral_rows(frame, cm, first_mi_row,
+                           first_mi_row + num_mi_rows, y_only);
+}
+#endif  // CONFIG_LOOP_RESTORATION
+
 int vp10_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
   (void)unused;
   vp10_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
diff --git a/vp10/common/loopfilter.h b/vp10/common/loopfilter.h
index 3d764394e..02bcb26e9 100644
--- a/vp10/common/loopfilter.h
+++ b/vp10/common/loopfilter.h
@@ -28,6 +28,69 @@ extern "C" {
 
 #define MAX_MODE_LF_DELTAS      2
 
+#if CONFIG_LOOP_RESTORATION
+#define BILATERAL_LEVEL_BITS_KF 4
+#define BILATERAL_LEVELS_KF     (1 << BILATERAL_LEVEL_BITS_KF)
+#define BILATERAL_LEVEL_BITS    3
+#define BILATERAL_LEVELS        (1 << BILATERAL_LEVEL_BITS)
+#define DEF_BILATERAL_LEVEL     2
+
+#define BILATERAL_PRECISION     16
+#define BILATERAL_HALFWIN       3
+#define BILATERAL_WIN           (2 * BILATERAL_HALFWIN + 1)
+
+typedef struct bilateral_params {
+  int sigma_x;  // horizontal spatial sigma, in 1/BILATERAL_PRECISION units
+  int sigma_y;  // vertical spatial sigma, in 1/BILATERAL_PRECISION units
+  int sigma_r;  // range (pixel difference) sigma, same units
+} bilateral_params_t;
+
+static const bilateral_params_t
+    bilateral_level_to_params_arr[BILATERAL_LEVELS + 1] = {
+  // {sigma_x, sigma_y, sigma_r} per non-key-frame level, in 1/16 units
+  {0, 0, 0},    // 0 - default (filter off)
+  {8, 9, 30},
+  {9, 8, 30},
+  {9, 11, 32},
+  {11, 9, 32},
+  {14, 14, 32},
+  {18, 18, 36},
+  {24, 24, 40},
+  {32, 32, 40},
+};
+
+static const bilateral_params_t
+    bilateral_level_to_params_arr_kf[BILATERAL_LEVELS_KF + 1] = {
+  // {sigma_x, sigma_y, sigma_r} per key-frame level, in 1/16 units
+  {0, 0, 0},    // 0 - default (filter off)
+  {8, 8, 30},
+  {9, 9, 32},
+  {10, 10, 32},
+  {12, 12, 32},
+  {14, 14, 32},
+  {18, 18, 36},
+  {24, 24, 40},
+  {30, 30, 44},
+  {36, 36, 48},
+  {42, 42, 48},
+  {48, 48, 48},
+  {48, 48, 56},
+  {56, 56, 48},
+  {56, 56, 56},
+  {56, 56, 64},
+  {64, 64, 48},
+};
+
+int vp10_bilateral_level_bits(const struct VP10Common *const cm);
+int vp10_loop_bilateral_used(int level, int kf);
+
+static INLINE bilateral_params_t vp10_bilateral_level_to_params(
+    int index, int kf) {
+  if (kf) return bilateral_level_to_params_arr_kf[index];  // key-frame table
+  return bilateral_level_to_params_arr[index];
+}
+#endif  // CONFIG_LOOP_RESTORATION
+
 enum lf_path {
   LF_PATH_420,
   LF_PATH_444,
@@ -51,6 +114,11 @@ struct loopfilter {
   // 0 = ZERO_MV, MV
   signed char mode_deltas[MAX_MODE_LF_DELTAS];
   signed char last_mode_deltas[MAX_MODE_LF_DELTAS];
+
+#if CONFIG_LOOP_RESTORATION
+  int bilateral_level;
+  int last_bilateral_level;
+#endif  // CONFIG_LOOP_RESTORATION
 };
 
 // Need to align this structure so when it is declared and
@@ -64,6 +132,14 @@ typedef struct {
 typedef struct {
   loop_filter_thresh lfthr[MAX_LOOP_FILTER + 1];
   uint8_t lvl[MAX_SEGMENTS][MAX_REF_FRAMES][MAX_MODE_LF_DELTAS];
+#if CONFIG_LOOP_RESTORATION
+  double * wx_lut[BILATERAL_WIN];
+  double * wr_lut;
+  int bilateral_sigma_x_set;
+  int bilateral_sigma_y_set;
+  int bilateral_sigma_r_set;
+  int bilateral_used;
+#endif  // CONFIG_LOOP_RESTORATION
 } loop_filter_info_n;
 
 // This structure holds bit masks for all 8x8 blocks in a 64x64 region.
@@ -133,6 +209,24 @@ void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
                           struct macroblockd_plane planes[MAX_MB_PLANE],
                           int start, int stop, int y_only);
 
+#if CONFIG_LOOP_RESTORATION
+void vp10_loop_bilateral_frame(YV12_BUFFER_CONFIG *frame,
+                               struct VP10Common *cm,
+                               int bilateral_level,
+                               int y_only, int partial_frame);
+void vp10_loop_filter_bilateral_frame(YV12_BUFFER_CONFIG *frame,
+                                      struct VP10Common *cm,
+                                      struct macroblockd *mbd,
+                                      int frame_filter_level,
+                                      int bilateral_level,
+                                      int y_only, int partial_frame);
+void vp10_loop_bilateral_init(loop_filter_info_n *lfi, int level, int kf);
+void vp10_loop_bilateral_rows(YV12_BUFFER_CONFIG *frame,
+                              struct VP10Common *cm,
+                              int start_mi_row, int end_mi_row,
+                              int y_only);
+#endif  // CONFIG_LOOP_RESTORATION
+
 typedef struct LoopFilterWorkerData {
   YV12_BUFFER_CONFIG *frame_buffer;
   struct VP10Common *cm;
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index 9b7a729be..23a20d439 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -172,6 +172,9 @@ typedef struct VP10Common {
   YV12_BUFFER_CONFIG post_proc_buffer;
   YV12_BUFFER_CONFIG post_proc_buffer_int;
 #endif
+#if CONFIG_LOOP_RESTORATION
+  YV12_BUFFER_CONFIG tmp_loop_buf;
+#endif  // CONFIG_LOOP_RESTORATION
 
   FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
 #if CONFIG_EXT_REFS
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 10fdb54f8..33e8332ea 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -2102,8 +2102,9 @@ static void setup_segmentation(VP10_COMMON *const cm,
   }
 }
 
-static void setup_loopfilter(struct loopfilter *lf,
+static void setup_loopfilter(VP10_COMMON *cm,
                              struct vpx_read_bit_buffer *rb) {
+  struct loopfilter *lf = &cm->lf;
   lf->filter_level = vpx_rb_read_literal(rb, 6);
   lf->sharpness_level = vpx_rb_read_literal(rb, 3);
 
@@ -2126,6 +2127,19 @@ static void setup_loopfilter(struct loopfilter *lf,
           lf->mode_deltas[i] = vpx_rb_read_inv_signed_literal(rb, 6);
     }
   }
+#if CONFIG_LOOP_RESTORATION
+  lf->bilateral_level = vpx_rb_read_bit(rb);
+  if (lf->bilateral_level) {
+    int level = vpx_rb_read_literal(rb, vp10_bilateral_level_bits(cm));
+    lf->bilateral_level = level + (level >= lf->last_bilateral_level);
+  } else {
+    lf->bilateral_level = lf->last_bilateral_level;
+  }
+  if (cm->frame_type != KEY_FRAME)
+    cm->lf.last_bilateral_level = cm->lf.bilateral_level;
+  else
+    cm->lf.last_bilateral_level = 0;
+#endif  // CONFIG_LOOP_RESTORATION
 }
 
 static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
@@ -3096,7 +3110,7 @@ static size_t read_uncompressed_header(VP10Decoder *pbi,
   if (frame_is_intra_only(cm) || cm->error_resilient_mode)
     vp10_setup_past_independence(cm);
 
-  setup_loopfilter(&cm->lf, rb);
+  setup_loopfilter(cm, rb);
   setup_quantization(cm, rb);
 #if CONFIG_VP9_HIGHBITDEPTH
   xd->bd = (int)cm->bit_depth;
@@ -3445,6 +3459,13 @@ void vp10_decode_frame(VP10Decoder *pbi,
   } else {
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
   }
+#if CONFIG_LOOP_RESTORATION
+  vp10_loop_bilateral_init(&cm->lf_info, cm->lf.bilateral_level,
+                           cm->frame_type == KEY_FRAME);
+  if (cm->lf_info.bilateral_used) {
+    vp10_loop_bilateral_rows(new_fb, cm, 0, cm->mi_rows, 0);
+  }
+#endif  // CONFIG_LOOP_RESTORATION
 
   if (!xd->corrupted) {
     if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 177dcc3f9..2e31d779a 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -1645,9 +1645,10 @@ static void update_coef_probs(VP10_COMP *cpi, vpx_writer* w) {
   }
 }
 
-static void encode_loopfilter(struct loopfilter *lf,
+static void encode_loopfilter(VP10_COMMON *cm,
                               struct vpx_write_bit_buffer *wb) {
   int i;
+  struct loopfilter *lf = &cm->lf;
 
   // Encode the loop filter level and type
   vpx_wb_write_literal(wb, lf->filter_level, 6);
@@ -1681,6 +1682,15 @@ static void encode_loopfilter(struct loopfilter *lf,
       }
     }
   }
+#if CONFIG_LOOP_RESTORATION
+  vpx_wb_write_bit(wb, lf->bilateral_level != lf->last_bilateral_level);
+  if (lf->bilateral_level != lf->last_bilateral_level) {
+    int level = lf->bilateral_level -
+                (lf->bilateral_level > lf->last_bilateral_level);
+    vpx_wb_write_literal(wb, level,
+                         vp10_bilateral_level_bits(cm));
+  }
+#endif  // CONFIG_LOOP_RESTORATION
 }
 
 static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
@@ -2139,7 +2149,7 @@ static void write_uncompressed_header(VP10_COMP *cpi,
 
   vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
 
-  encode_loopfilter(&cm->lf, wb);
+  encode_loopfilter(cm, wb);
   encode_quantization(cm, wb);
   encode_segmentation(cm, xd, wb);
   if (!cm->seg.enabled && xd->lossless[0])
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index 48d873d26..c9f5fe575 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -377,10 +377,16 @@ static void dealloc_compressor_data(VP10_COMP *cpi) {
   vp10_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
   vp10_free_postproc_buffers(cm);
-#endif
+#endif  // CONFIG_VP9_POSTPROC
+#if CONFIG_LOOP_RESTORATION
+  vp10_free_restoration_buffers(cm);
+#endif  // CONFIG_LOOP_RESTORATION
   vp10_free_context_buffers(cm);
 
   vpx_free_frame_buffer(&cpi->last_frame_uf);
+#if CONFIG_LOOP_RESTORATION
+  vpx_free_frame_buffer(&cpi->last_frame_db);
+#endif  // CONFIG_LOOP_RESTORATION
   vpx_free_frame_buffer(&cpi->scaled_source);
   vpx_free_frame_buffer(&cpi->scaled_last_source);
   vpx_free_frame_buffer(&cpi->alt_ref_buffer);
@@ -634,6 +640,19 @@ static void alloc_util_frame_buffers(VP10_COMP *cpi) {
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
+#if CONFIG_LOOP_RESTORATION
+  if (vpx_realloc_frame_buffer(&cpi->last_frame_db,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate last frame deblocked buffer");
+#endif  // CONFIG_LOOP_RESTORATION
+
   if (vpx_realloc_frame_buffer(&cpi->scaled_source,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
@@ -2759,6 +2778,12 @@ static void loopfilter_frame(VP10_COMP *cpi, VP10_COMMON *cm) {
       vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
 #endif
   }
+#if CONFIG_LOOP_RESTORATION
+  vp10_loop_bilateral_init(&cm->lf_info, cm->lf.bilateral_level,
+                           cm->frame_type == KEY_FRAME);
+  if (cm->lf_info.bilateral_used)
+    vp10_loop_bilateral_rows(cm->frame_to_show, cm, 0, cm->mi_rows, 0);
+#endif  // CONFIG_LOOP_RESTORATION
 
   vpx_extend_frame_inner_borders(cm->frame_to_show);
 }
@@ -3867,6 +3892,12 @@ static void encode_frame_to_data_rate(VP10_COMP *cpi,
   cm->last2_frame_type = cm->last_frame_type;
 #endif  // CONFIG_EXT_REFS
   cm->last_frame_type = cm->frame_type;
+#if CONFIG_LOOP_RESTORATION
+  if (cm->frame_type != KEY_FRAME)
+    cm->lf.last_bilateral_level = cm->lf.bilateral_level;
+  else
+    cm->lf.last_bilateral_level = 0;
+#endif  // CONFIG_LOOP_RESTORATION
 
   vp10_rc_postencode_update(cpi, *size);
 
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 40bc4d760..797abacaf 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -338,6 +338,9 @@ typedef struct VP10_COMP {
   int ext_refresh_frame_context;
 
   YV12_BUFFER_CONFIG last_frame_uf;
+#if CONFIG_LOOP_RESTORATION
+  YV12_BUFFER_CONFIG last_frame_db;
+#endif  // CONFIG_LOOP_RESTORATION
 
   TOKENEXTRA *tile_tok[4][1 << 6];
   unsigned int tok_count[4][1 << 6];
diff --git a/vp10/encoder/picklpf.c b/vp10/encoder/picklpf.c
index 1f5711df1..9bd1555cb 100644
--- a/vp10/encoder/picklpf.c
+++ b/vp10/encoder/picklpf.c
@@ -43,15 +43,16 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
 
 #if CONFIG_VAR_TX
   vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
-                        1, partial_frame);
+                         1, partial_frame);
 #else
   if (cpi->num_workers > 1)
     vp10_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
-                             filt_level, 1, partial_frame,
-                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+                              filt_level, 1, partial_frame,
+                              cpi->workers, cpi->num_workers,
+                              &cpi->lf_row_sync);
   else
     vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
-                          1, partial_frame);
+                           1, partial_frame);
 #endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -70,6 +71,177 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
   return filt_err;
 }
 
+#if CONFIG_LOOP_RESTORATION
+#define JOINT_FILTER_BILATERAL_SEARCH
+#define USE_RD_LOOP_POSTFILTER_SEARCH
+// Bilateral-filters the luma plane of cm->frame_to_show at |bilateral_level|,
+// returns its SSE against |sd|, then restores the plane from
+// cpi->last_frame_db (filled by the caller). The SSE stays 64-bit, as in
+// try_filter_frame(), since a whole-frame sum of squares can overflow int.
+static int64_t try_bilateral_frame(const YV12_BUFFER_CONFIG *sd,
+                                   VP10_COMP *const cpi,
+                                   int bilateral_level,
+                                   int partial_frame) {
+  VP10_COMMON *const cm = &cpi->common;
+  int64_t filt_err;
+  vp10_loop_bilateral_frame(cm->frame_to_show, cm,
+                            bilateral_level, 1, partial_frame);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    filt_err = vp10_highbd_get_y_sse(sd, cm->frame_to_show);
+  } else {
+    filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+  }
+#else
+  filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Re-instate the deblocked (pre-restoration) frame
+  vpx_yv12_copy_y(&cpi->last_frame_db, cm->frame_to_show);
+  return filt_err;
+}
+
+// Tries every bilateral level on the frame deblocked with |filter_level| and
+// returns the cheapest one (0 = no restoration); its cost is also written to
+// |*best_cost_ret| if non-NULL. The luma plane of cm->frame_to_show is
+// restored on exit; cpi->last_frame_uf and cpi->last_frame_db are overwritten.
+static int search_bilateral_level(const YV12_BUFFER_CONFIG *sd,
+                                  VP10_COMP *cpi,
+                                  int filter_level, int partial_frame,
+                                  double *best_cost_ret) {
+  VP10_COMMON *const cm = &cpi->common;
+  int i, bilateral_best = 0;
+  double best_cost = 0;
+  const int bilateral_level_bits = vp10_bilateral_level_bits(&cpi->common);
+  const int bilateral_levels = 1 << bilateral_level_bits;
+#ifdef USE_RD_LOOP_POSTFILTER_SEARCH
+  MACROBLOCK *x = &cpi->td.mb;
+#endif
+
+  //  Make a copy of the unfiltered / processed recon buffer
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+  vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filter_level,
+                         1, partial_frame);
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_db);
+
+  // Level 0 (filter off) is the baseline; ties keep the lower level. With the
+  // RD search, a level equal to last_bilateral_level is charged no level bits.
+  for (i = 0; i <= bilateral_levels; ++i) {
+    const int64_t err = try_bilateral_frame(sd, cpi, i, partial_frame);
+#ifdef USE_RD_LOOP_POSTFILTER_SEARCH
+    // Normally the rate is rate in bits * 256 and dist is sum sq err * 64
+    // when RDCOST is used.  However below we just scale both in the correct
+    // ratios appropriately but not exactly by these values.
+    const int bits =
+        cm->lf.last_bilateral_level == i ? 0 : bilateral_level_bits;
+    const double cost = RDCOST_DBL(x->rdmult, x->rddiv, (bits << 2), err);
+#else
+    const double cost = (double)err;
+#endif  // USE_RD_LOOP_POSTFILTER_SEARCH
+    if (i == 0 || cost < best_cost) {
+      bilateral_best = i;
+      best_cost = cost;
+    }
+  }
+  if (best_cost_ret) *best_cost_ret = best_cost;
+  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+  return bilateral_best;
+}
+
+#ifdef JOINT_FILTER_BILATERAL_SEARCH
+// Jointly picks the deblocking filter level (returned) and the bilateral level
+// (stored in |*bilateral_level|) with a step-halving search that starts at
+// lf->filter_level. Each probed filter level is scored by the cost of its best
+// bilateral level; both are cached so a revisited level reuses its own result.
+static int search_filter_bilateral_level(const YV12_BUFFER_CONFIG *sd,
+                                         VP10_COMP *cpi,
+                                         int partial_frame,
+                                         int *bilateral_level) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const struct loopfilter *const lf = &cm->lf;
+  const int min_filter_level = 0;
+  const int max_filter_level = get_max_filter_level(cpi);
+  int filt_direction = 0;
+  int filt_best, bilateral_best, i;
+  double best_err;
+
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
+  int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+  int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  // Per filter level: best cost (-1 = unsearched) and its bilateral level.
+  double ss_err[MAX_LOOP_FILTER + 1];
+  int bilateral_lev[MAX_LOOP_FILTER + 1] = { 0 };
+
+  for (i = 0; i <= MAX_LOOP_FILTER; ++i)
+    ss_err[i] = -1.0;
+
+  bilateral_lev[filt_mid] = search_bilateral_level(sd, cpi, filt_mid,
+                                                   partial_frame, &best_err);
+  filt_best = filt_mid;
+  bilateral_best = bilateral_lev[filt_mid];
+  ss_err[filt_mid] = best_err;
+
+  while (filter_step > 0) {
+    const int filt_high = VPXMIN(filt_mid + filter_step, max_filter_level);
+    const int filt_low = VPXMAX(filt_mid - filter_step, min_filter_level);
+
+    // Bias against raising loop filter in favor of lowering it.
+    double bias = (best_err / (1 << (15 - (filt_mid / 8)))) * filter_step;
+
+    if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+    // yx, bias less for large block size
+    if (cm->tx_mode != ONLY_4X4)
+      bias /= 2;
+
+    if (filt_direction <= 0 && filt_low != filt_mid) {
+      // Get Low filter error score
+      if (ss_err[filt_low] < 0) {
+        bilateral_lev[filt_low] = search_bilateral_level(
+            sd, cpi, filt_low, partial_frame, &ss_err[filt_low]);
+      }
+      // If value is close to the best so far then bias towards a lower loop
+      // filter value.
+      if ((ss_err[filt_low] - bias) < best_err) {
+        // Was it actually better than the previous best?
+        if (ss_err[filt_low] < best_err)
+          best_err = ss_err[filt_low];
+        filt_best = filt_low;
+        bilateral_best = bilateral_lev[filt_low];
+      }
+    }
+
+    // Now look at filt_high
+    if (filt_direction >= 0 && filt_high != filt_mid) {
+      if (ss_err[filt_high] < 0) {
+        bilateral_lev[filt_high] = search_bilateral_level(
+            sd, cpi, filt_high, partial_frame, &ss_err[filt_high]);
+      }
+      // Was it better than the previous best?
+      if (ss_err[filt_high] < (best_err - bias)) {
+        best_err = ss_err[filt_high];
+        filt_best = filt_high;
+        bilateral_best = bilateral_lev[filt_high];
+      }
+    }
+
+    // Half the step distance if the best filter value was the same as last time
+    if (filt_best == filt_mid) {
+      filter_step /= 2;
+      filt_direction = 0;
+    } else {
+      filt_direction = (filt_best < filt_mid) ? -1 : 1;
+      filt_mid = filt_best;
+    }
+  }
+  *bilateral_level = bilateral_best;
+  return filt_best;
+}
+#endif  // JOINT_FILTER_BILATERAL_SEARCH
+#endif  // CONFIG_LOOP_RESTORATION
+
 static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
                                int partial_frame) {
   const VP10_COMMON *const cm = &cpi->common;
@@ -191,8 +363,24 @@ void vp10_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
     if (cm->frame_type == KEY_FRAME)
       filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+#if CONFIG_LOOP_RESTORATION
+    lf->bilateral_level = search_bilateral_level(
+        sd, cpi, lf->filter_level, method == LPF_PICK_FROM_SUBIMAGE, NULL);
+#endif  // CONFIG_LOOP_RESTORATION
   } else {
-    lf->filter_level = search_filter_level(sd, cpi,
-                                           method == LPF_PICK_FROM_SUBIMAGE);
+#if CONFIG_LOOP_RESTORATION
+#ifdef JOINT_FILTER_BILATERAL_SEARCH
+    lf->filter_level = search_filter_bilateral_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, &lf->bilateral_level);
+#else
+    lf->filter_level = search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE);
+    lf->bilateral_level = search_bilateral_level(
+        sd, cpi, lf->filter_level, method == LPF_PICK_FROM_SUBIMAGE, NULL);
+#endif  // JOINT_FILTER_BILATERAL_SEARCH
+#else
+    lf->filter_level = search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE);
+#endif  // CONFIG_LOOP_RESTORATION
   }
 }
diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index 42d8ea167..2b6106d95 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h
@@ -26,6 +26,10 @@ extern "C" {
 
 #define RDCOST(RM, DM, R, D) \
   (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
+
+#define RDCOST_DBL(RM, DM, R, D) \
+  (((((double)(R)) * (RM)) / 256.0) + ((double)(D)  * (1 << (DM))))  // double
+
 #define QIDX_SKIP_THRESH     115
 
 #define MV_COST_WEIGHT      108
diff --git a/vp10/vp10_dx_iface.c b/vp10/vp10_dx_iface.c
index 33337a4bd..a0a58e85b 100644
--- a/vp10/vp10_dx_iface.c
+++ b/vp10/vp10_dx_iface.c
@@ -122,6 +122,9 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) {
 #if CONFIG_VP9_POSTPROC
       vp10_free_postproc_buffers(&frame_worker_data->pbi->common);
 #endif
+#if CONFIG_LOOP_RESTORATION
+      vp10_free_restoration_buffers(&frame_worker_data->pbi->common);
+#endif  // CONFIG_LOOP_RESTORATION
       vp10_decoder_remove(frame_worker_data->pbi);
       vpx_free(frame_worker_data->scratch_buffer);
 #if CONFIG_MULTITHREAD