From: Yaowu Xu <yaowu@google.com>
Date: Fri, 14 Oct 2016 15:39:03 +0000 (+0000)
Subject: Revert "Move CLPF block signals from frame to SB level."
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9b25f3067485b32442e13964df098903736c3fd8;p=libvpx

Revert "Move CLPF block signals from frame to SB level."

This reverts commit 975350387ce0b55bf5af8cb944f6a242b72251ff.

Change-Id: I9f8e891739352ca2bde4b294e37c85a668f416e0
---

diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index a01e6b461..1cf52724d 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -14,6 +14,14 @@
 #include "aom/aom_image.h"
 #include "aom_dsp/aom_dsp_common.h"
 
+int av1_clpf_maxbits(const AV1_COMMON *cm) {
+  return get_msb(
+             ALIGN_POWER_OF_TWO(cm->mi_cols * MI_SIZE, cm->clpf_size + 4) *
+                 ALIGN_POWER_OF_TWO(cm->mi_rows * MI_SIZE, cm->clpf_size + 4) >>
+             (cm->clpf_size * 2 + 8)) +
+         1;
+}
+
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) {
   int delta = 4 * clamp(A - X, -b, b) + clamp(B - X, -b, b) +
               3 * clamp(C - X, -b, b) + 3 * clamp(D - X, -b, b) +
@@ -65,14 +73,14 @@ void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
 #endif
 
 // Return number of filtered blocks
-void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
-                    const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
-                    int enable_fb_flag, unsigned int strength,
-                    unsigned int fb_size_log2, int plane,
-                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
-                                    const YV12_BUFFER_CONFIG *,
-                                    const AV1_COMMON *cm, int, int, int,
-                                    unsigned int, unsigned int, int8_t *)) {
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
+                   int enable_fb_flag, unsigned int strength,
+                   unsigned int fb_size_log2, uint8_t *blocks, int plane,
+                   int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                                   const YV12_BUFFER_CONFIG *,
+                                   const AV1_COMMON *cm, int, int, int,
+                                   unsigned int, unsigned int, uint8_t *)) {
   /* Constrained low-pass filter (CLPF) */
   int c, k, l, m, n;
   const int subx = plane != AOM_PLANE_Y && frame->subsampling_x;
@@ -87,6 +95,7 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
   int dstride = bs;
   const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2;
   const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2;
+  int block_index = 0;
   uint8_t *cache = NULL;
   uint8_t **cache_ptr = NULL;
   uint8_t **cache_dst = NULL;
@@ -116,7 +125,7 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
   for (k = 0; k < num_fb_ver; k++) {
     for (l = 0; l < num_fb_hor; l++) {
       int h, w;
-      int allskip = !(enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2);
+      int allskip = 1;
       const int xoff = l << fb_size_log2;
       const int yoff = k << fb_size_log2;
       for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) {
@@ -139,11 +148,8 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
       w += !w << fb_size_log2;
       if (!allskip &&  // Do not filter the block if all is skip encoded
           (!enable_fb_flag ||
-           // Only called if fb_flag enabled (luma only)
            decision(k, l, frame, org, cm, bs, w / bs, h / bs, strength,
-                    fb_size_log2,
-                    cm->clpf_blocks + yoff / MIN_FB_SIZE * cm->clpf_stride +
-                        xoff / MIN_FB_SIZE))) {
+                    fb_size_log2, blocks + block_index))) {
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
           for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
@@ -154,9 +160,8 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
             sizey = AOMMIN(height - ypos, bs);
             if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
                                      (xpos << subx) / MI_SIZE]
-                     ->mbmi.skip ||
-                (enable_fb_flag && fb_size_log2 == MAX_FB_SIZE_LOG2)) {
-              // Temporary buffering needed for in-place filtering
+                     ->mbmi.skip) {  // Not skip block
+              // Temporary buffering needed if filtering in-place
               if (cache_ptr[cache_idx]) {
 // Copy filtered block back into the frame
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -242,6 +247,7 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
           }
         }
       }
+      block_index += !allskip;  // Count number of blocks filtered
     }
   }
 
@@ -281,4 +287,6 @@ void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
   aom_free(cache);
   aom_free(cache_ptr);
   aom_free(cache_dst);
+
+  return block_index;
 }
diff --git a/av1/common/clpf.h b/av1/common/clpf.h
index fc74f2cca..8e4213b20 100644
--- a/av1/common/clpf.h
+++ b/av1/common/clpf.h
@@ -13,19 +13,17 @@
 
 #include "av1/common/reconinter.h"
 
-#define MAX_FB_SIZE_LOG2 7
-#define MIN_FB_SIZE_LOG2 5
-#define MAX_FB_SIZE (1 << MAX_FB_SIZE_LOG2)
-#define MIN_FB_SIZE (1 << MIN_FB_SIZE_LOG2)
+#define MAX_FB_SIZE 128
 
+int av1_clpf_maxbits(const AV1_COMMON *cm);
 int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b);
-void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
-                    const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
-                    int enable_fb_flag, unsigned int strength,
-                    unsigned int fb_size_log2, int plane,
-                    int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
-                                    const YV12_BUFFER_CONFIG *,
-                                    const AV1_COMMON *cm, int, int, int,
-                                    unsigned int, unsigned int, int8_t *));
+int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
+                   const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm,
+                   int enable_fb_flag, unsigned int strength,
+                   unsigned int fb_size_log2, uint8_t *blocks, int plane,
+                   int (*decision)(int, int, const YV12_BUFFER_CONFIG *,
+                                   const YV12_BUFFER_CONFIG *,
+                                   const AV1_COMMON *cm, int, int, int,
+                                   unsigned int, unsigned int, uint8_t *));
 
 #endif
diff --git a/av1/common/enums.h b/av1/common/enums.h
index 7c6325542..c8776ef25 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -246,17 +246,7 @@ typedef enum {
   PALETTE_COLORS
 } PALETTE_COLOR;
 
-#ifdef CONFIG_CLPF
-#define CLPF_NOFLAG -1
-typedef enum {
-  CLPF_NOSIZE = 0,
-  CLPF_32X32 = 1,
-  CLPF_64X64 = 2,
-  CLPF_128X128 = 3
-} CLPF_BLOCK_SIZE;
-#endif
 typedef enum ATTRIBUTE_PACKED {
-
   DC_PRED,    // Average of above and left pixels
   V_PRED,     // Vertical
   H_PRED,     // Horizontal
diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h
index 3a2203ac1..98f4f517c 100644
--- a/av1/common/onyxc_int.h
+++ b/av1/common/onyxc_int.h
@@ -151,27 +151,12 @@ typedef struct AV1Common {
   int use_highbitdepth;
 #endif
 #if CONFIG_CLPF
-  // Two bits are used to signal the strength for all blocks and the
-  // valid values are:
-  // 0: no filtering
-  // 1: strength = 1
-  // 2: strength = 2
-  // 3: strength = 4
+  int clpf_numblocks;
+  int clpf_size;
   int clpf_strength_y;
   int clpf_strength_u;
   int clpf_strength_v;
-
-  // If clpf_strength_y is not 0, another two bits are used to signal
-  // the filter block size.  The valid values for clfp_size are:
-  // 0: no block signalling
-  // 1: 32x32
-  // 2: 64x64
-  // 3: 128x128
-  CLPF_BLOCK_SIZE clpf_size;
-
-  // Buffer for storing whether to filter individual blocks.
-  int8_t *clpf_blocks;
-  int clpf_stride;
+  uint8_t *clpf_blocks;
 #endif
 
   YV12_BUFFER_CONFIG *frame_to_show;
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index e41f16609..e1bbe24f9 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -1440,22 +1440,6 @@ static int read_skip(AV1_COMMON *cm, const MACROBLOCKD *xd, int segment_id,
   }
 }
 #endif  // CONFIG_SUPERTX
-#if CONFIG_CLPF
-static int clpf_all_skip(const AV1_COMMON *cm, int mi_col, int mi_row,
-                         int size) {
-  int r, c;
-  int skip = 1;
-  const int maxc = AOMMIN(size, cm->mi_cols - mi_col);
-  const int maxr = AOMMIN(size, cm->mi_rows - mi_row);
-  for (r = 0; r < maxr && skip; r++) {
-    for (c = 0; c < maxc && skip; c++) {
-      skip &= !!cm->mi_grid_visible[(mi_row + r) * cm->mi_stride + mi_col + c]
-                    ->mbmi.skip;
-    }
-  }
-  return skip;
-}
-#endif
 
 // TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
@@ -1788,43 +1772,6 @@ static void decode_partition(AV1Decoder *const pbi, MACROBLOCKD *const xd,
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
-
-#if CONFIG_CLPF
-  if (bsize == BLOCK_64X64 && cm->clpf_strength_y &&
-      cm->clpf_size != CLPF_NOSIZE) {
-    const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
-                   mi_col * MI_SIZE / MIN_FB_SIZE;
-
-    if (!((mi_row * MI_SIZE) & 127) && !((mi_col * MI_SIZE) & 127) &&
-        cm->clpf_size == CLPF_128X128) {
-      cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
-    } else if (cm->clpf_size == CLPF_64X64 &&
-               !clpf_all_skip(cm, mi_col, mi_row, 64 / MI_SIZE)) {
-      cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
-    } else if (cm->clpf_size == CLPF_32X32) {
-      const int tr = tl + 1;
-      const int bl = tl + cm->clpf_stride;
-      const int br = tr + cm->clpf_stride;
-      const int size = 32 / MI_SIZE;
-
-      // Up to four bits per SB
-      if (!clpf_all_skip(cm, mi_col, mi_row, size))
-        cm->clpf_blocks[tl] = aom_read_literal(r, 1, ACCT_STR);
-
-      if (mi_col + size < cm->mi_cols &&
-          !clpf_all_skip(cm, mi_col + size, mi_row, size))
-        cm->clpf_blocks[tr] = aom_read_literal(r, 1, ACCT_STR);
-
-      if (mi_row + size < cm->mi_rows &&
-          !clpf_all_skip(cm, mi_col, mi_row + size, size))
-        cm->clpf_blocks[bl] = aom_read_literal(r, 1, ACCT_STR);
-
-      if (mi_col + size < cm->mi_cols && mi_row + size < cm->mi_rows &&
-          !clpf_all_skip(cm, mi_col + size, mi_row + size, size))
-        cm->clpf_blocks[br] = aom_read_literal(r, 1, ACCT_STR);
-    }
-  }
-#endif
 #if CONFIG_DERING
   if (bsize == BLOCK_64X64) {
     if (cm->dering_level != 0 && !sb_all_skip(cm, mi_row, mi_col)) {
@@ -2098,26 +2045,20 @@ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
 }
 
 #if CONFIG_CLPF
-static void setup_clpf(AV1Decoder *pbi, struct aom_read_bit_buffer *rb) {
-  AV1_COMMON *const cm = &pbi->common;
-  const int width = pbi->cur_buf->buf.y_crop_width;
-  const int height = pbi->cur_buf->buf.y_crop_height;
-
+static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) {
   cm->clpf_blocks = 0;
   cm->clpf_strength_y = aom_rb_read_literal(rb, 2);
   cm->clpf_strength_u = aom_rb_read_literal(rb, 2);
   cm->clpf_strength_v = aom_rb_read_literal(rb, 2);
   if (cm->clpf_strength_y) {
     cm->clpf_size = aom_rb_read_literal(rb, 2);
-    if (cm->clpf_size != CLPF_NOSIZE) {
-      int size;
-      cm->clpf_stride =
-          ((width + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >> MIN_FB_SIZE_LOG2;
-      size =
-          cm->clpf_stride * ((height + MIN_FB_SIZE - 1) & ~(MIN_FB_SIZE - 1)) >>
-          MIN_FB_SIZE_LOG2;
-      CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
-      memset(cm->clpf_blocks, -1, size);
+    if (cm->clpf_size) {
+      int i;
+      cm->clpf_numblocks = aom_rb_read_literal(rb, av1_clpf_maxbits(cm));
+      CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(cm->clpf_numblocks));
+      for (i = 0; i < cm->clpf_numblocks; i++) {
+        cm->clpf_blocks[i] = aom_rb_read_literal(rb, 1);
+      }
     }
   }
 }
@@ -2127,7 +2068,7 @@ static int clpf_bit(UNUSED int k, UNUSED int l,
                     UNUSED const YV12_BUFFER_CONFIG *org,
                     UNUSED const AV1_COMMON *cm, UNUSED int block_size,
                     UNUSED int w, UNUSED int h, UNUSED unsigned int strength,
-                    UNUSED unsigned int fb_size_log2, int8_t *bit) {
+                    UNUSED unsigned int fb_size_log2, uint8_t *bit) {
   return *bit;
 }
 #endif
@@ -3420,7 +3361,7 @@ static size_t read_uncompressed_header(AV1Decoder *pbi,
 
   setup_loopfilter(cm, rb);
 #if CONFIG_CLPF
-  setup_clpf(pbi, rb);
+  setup_clpf(cm, rb);
 #endif
 #if CONFIG_DERING
   setup_dering(cm, rb);
@@ -3992,18 +3933,18 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data,
   if (!cm->skip_loop_filter) {
     const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf;
     if (cm->clpf_strength_y) {
-      av1_clpf_frame(frame, NULL, cm, cm->clpf_size != CLPF_NOSIZE,
+      av1_clpf_frame(frame, NULL, cm, !!cm->clpf_size,
                      cm->clpf_strength_y + (cm->clpf_strength_y == 3),
-                     4 + cm->clpf_size, AOM_PLANE_Y, clpf_bit);
+                     4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, clpf_bit);
     }
     if (cm->clpf_strength_u) {
-      av1_clpf_frame(frame, NULL, cm, 0,  // No block signals for chroma
-                     cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4,
+      av1_clpf_frame(frame, NULL, cm, 0,
+                     cm->clpf_strength_u + (cm->clpf_strength_u == 3), 4, NULL,
                      AOM_PLANE_U, NULL);
     }
     if (cm->clpf_strength_v) {
-      av1_clpf_frame(frame, NULL, cm, 0,  // No block signals for chroma
-                     cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4,
+      av1_clpf_frame(frame, NULL, cm, 0,
+                     cm->clpf_strength_v + (cm->clpf_strength_v == 3), 4, NULL,
                      AOM_PLANE_V, NULL);
     }
   }
diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c
index 6604728d6..aaffebbcb 100644
--- a/av1/encoder/bitstream.c
+++ b/av1/encoder/bitstream.c
@@ -1869,37 +1869,6 @@ static void write_modes_sb(AV1_COMP *const cpi, const TileInfo *const tile,
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 
-#if CONFIG_CLPF
-  if (bsize == BLOCK_64X64 && cm->clpf_blocks && cm->clpf_strength_y &&
-      cm->clpf_size != CLPF_NOSIZE) {
-    const int tl = mi_row * MI_SIZE / MIN_FB_SIZE * cm->clpf_stride +
-                   mi_col * MI_SIZE / MIN_FB_SIZE;
-    const int tr = tl + 1;
-    const int bl = tl + cm->clpf_stride;
-    const int br = tr + cm->clpf_stride;
-
-    // Up to four bits per SB.
-    // When clpf_size indicates a size larger than the SB size
-    // (CLPF_128X128), one bit for every fourth SB will be transmitted
-    // regardless of skip blocks.
-    if (cm->clpf_blocks[tl] != CLPF_NOFLAG)
-      aom_write_literal(w, cm->clpf_blocks[tl], 1);
-
-    if (mi_col + MI_SIZE / 2 < cm->mi_cols &&
-        cm->clpf_blocks[tr] != CLPF_NOFLAG)
-      aom_write_literal(w, cm->clpf_blocks[tr], 1);
-
-    if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
-        cm->clpf_blocks[bl] != CLPF_NOFLAG)
-      aom_write_literal(w, cm->clpf_blocks[bl], 1);
-
-    if (mi_row + MI_SIZE / 2 < cm->mi_rows &&
-        mi_col + MI_SIZE / 2 < cm->mi_cols &&
-        cm->clpf_blocks[br] != CLPF_NOFLAG)
-      aom_write_literal(w, cm->clpf_blocks[br], 1);
-  }
-#endif
-
 #if CONFIG_DERING
   if (bsize == BLOCK_64X64 && cm->dering_level != 0 &&
       !sb_all_skip(cm, mi_row, mi_col)) {
@@ -2564,6 +2533,18 @@ static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) {
   aom_wb_write_literal(wb, cm->clpf_strength_v, 2);
   if (cm->clpf_strength_y) {
     aom_wb_write_literal(wb, cm->clpf_size, 2);
+    if (cm->clpf_size) {
+      int i;
+      // TODO(stemidts): The number of bits to transmit could be
+      // implicitly deduced if transmitted after the filter block or
+      // after the frame (when it's known whether the block is all
+      // skip and implicitly unfiltered).  And the bits do not have
+      // 50% probability, so a more efficient coding is possible.
+      aom_wb_write_literal(wb, cm->clpf_numblocks, av1_clpf_maxbits(cm));
+      for (i = 0; i < cm->clpf_numblocks; i++) {
+        aom_wb_write_literal(wb, cm->clpf_blocks ? cm->clpf_blocks[i] : 0, 1);
+      }
+    }
   }
 }
 #endif
diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c
index 4e652b6cb..1d498f11b 100644
--- a/av1/encoder/clpf_rdo.c
+++ b/av1/encoder/clpf_rdo.c
@@ -127,15 +127,14 @@ void aom_clpf_detect_multi_hbd_c(const uint16_t *rec, const uint16_t *org,
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, int8_t *res) {
+                      unsigned int fb_size_log2, uint8_t *res) {
   int m, n, sum0 = 0, sum1 = 0;
 
   for (m = 0; m < h; m++) {
     for (n = 0; n < w; n++) {
       int xpos = (l << fb_size_log2) + n * block_size;
       int ypos = (k << fb_size_log2) + m * block_size;
-      if (fb_size_log2 == MAX_FB_SIZE_LOG2 ||
-          !cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
+      if (!cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
                ->mbmi.skip) {
 #if CONFIG_AOM_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
@@ -168,8 +167,6 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
 // (Only for luma:)
 // res[1][0]   : (bit count, fb size = 128)
 // res[1][1-3] : strength=1,2,4, fb size = 128
-// res[1][4]   : unfiltered, including skip
-// res[1][5-7] : strength=1,2,4, including skip, fb_size = 128
 // res[2][0]   : (bit count, fb size = 64)
 // res[2][1-3] : strength=1,2,4, fb size = 64
 // res[3][0]   : (bit count, fb size = 32)
@@ -177,9 +174,9 @@ int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
 static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
                     const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                     unsigned int block_size, unsigned int fb_size_log2, int w,
-                    int h, int64_t res[4][8], int plane) {
+                    int h, int64_t res[4][4], int plane) {
   int c, m, n, filtered = 0;
-  int sum[8];
+  int sum[4];
   const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
   const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
   int bslog = get_msb(block_size);
@@ -196,12 +193,12 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
       plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
   int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
   int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
-  sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0;
+  sum[0] = sum[1] = sum[2] = sum[3] = 0;
   if (plane == AOM_PLANE_Y &&
       fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
     int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;
 
-    filtered = fb_size_log2-- == MAX_FB_SIZE_LOG2;
+    fb_size_log2--;
     w1 = AOMMIN(1 << (fb_size_log2 - bslog), w);
     h1 = AOMMIN(1 << (fb_size_log2 - bslog), h);
     w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1);
@@ -213,8 +210,8 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
     oldfiltered = res[i][0];
     res[i][0] = 0;
 
-    filtered |= clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
-                         res, plane);
+    filtered = clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
+                        res, plane);
     if (1 << (fb_size_log2 - bslog) < w)
       filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
                            fb_size_log2, w2, h1, res, plane);
@@ -226,18 +223,10 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
                    cm, block_size, fb_size_log2, w2, h2, res, plane);
     }
 
-    // Correct sums for unfiltered blocks
     res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
     res[i][2] = AOMMIN(sum2 + res[i][0], res[i][2]);
     res[i][3] = AOMMIN(sum3 + res[i][0], res[i][3]);
-    if (i == 1) {
-      res[i][5] = AOMMIN(sum1 + res[i][4], res[i][5]);
-      res[i][6] = AOMMIN(sum2 + res[i][4], res[i][6]);
-      res[i][7] = AOMMIN(sum3 + res[i][4], res[i][7]);
-    }
-
     res[i][0] = oldfiltered + filtered;  // Number of signal bits
-
     return filtered;
   }
 
@@ -245,28 +234,27 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
     for (n = 0; n < w; n++) {
       int xpos = x + n * block_size;
       int ypos = y + m * block_size;
-      int skip =  // Filtered skip blocks stored only for fb_size == 128
-          4 *
-          !!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
-                                (xpos << subx) / MI_SIZE]
-                ->mbmi.skip;
+      if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
+                               (xpos << subx) / MI_SIZE]
+               ->mbmi.skip) {
 #if CONFIG_AOM_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        aom_clpf_detect_multi_hbd(CONVERT_TO_SHORTPTR(rec_buffer),
-                                  CONVERT_TO_SHORTPTR(org_buffer), rec_stride,
-                                  org_stride, xpos, ypos, rec_width, rec_height,
-                                  sum + skip, cm->bit_depth - 8, block_size);
-      } else {
+        if (cm->use_highbitdepth) {
+          aom_clpf_detect_multi_hbd(
+              CONVERT_TO_SHORTPTR(rec_buffer), CONVERT_TO_SHORTPTR(org_buffer),
+              rec_stride, org_stride, xpos, ypos, rec_width, rec_height, sum,
+              cm->bit_depth - 8, block_size);
+        } else {
+          aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
+                                xpos, ypos, rec_width, rec_height, sum,
+                                block_size);
+        }
+#else
         aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                              xpos, ypos, rec_width, rec_height, sum + skip,
+                              xpos, ypos, rec_width, rec_height, sum,
                               block_size);
-      }
-#else
-      aom_clpf_detect_multi(rec_buffer, org_buffer, rec_stride, org_stride,
-                            xpos, ypos, rec_width, rec_height, sum + skip,
-                            block_size);
 #endif
-      filtered |= !skip;
+        filtered = 1;
+      }
     }
   }
 
@@ -275,12 +263,6 @@ static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
     res[c][1] += sum[1];
     res[c][2] += sum[2];
     res[c][3] += sum[3];
-    if (c != 1) continue;
-    // Only needed when fb_size == 128
-    res[c][4] += sum[4];
-    res[c][5] += sum[5];
-    res[c][6] += sum[6];
-    res[c][7] += sum[7];
   }
   return filtered;
 }
@@ -289,7 +271,7 @@ void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                          int *best_strength, int *best_bs, int plane) {
   int c, j, k, l;
-  int64_t best, sums[4][8];
+  int64_t best, sums[4][4];
   int width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
   int height = plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
   const int bs = MI_SIZE;
@@ -321,14 +303,8 @@ void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
       }
     }
 
-  // For fb_size == 128 skip blocks are included in the result.
-  if (plane == AOM_PLANE_Y) {
-    sums[1][1] += sums[1][5] - sums[1][4];
-    sums[1][2] += sums[1][6] - sums[1][4];
-    sums[1][3] += sums[1][7] - sums[1][4];
-  } else {  // Slightly favour unfiltered chroma
+  if (plane != AOM_PLANE_Y)  // Slightly favour unfiltered chroma
     sums[0][0] -= sums[0][0] >> 7;
-  }
 
   for (j = 0; j < 4; j++) {
     static const double lambda_square[] = {
diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h
index 586eed03d..bb85fbcea 100644
--- a/av1/encoder/clpf_rdo.h
+++ b/av1/encoder/clpf_rdo.h
@@ -17,7 +17,7 @@
 int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                       const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                       int block_size, int w, int h, unsigned int strength,
-                      unsigned int fb_size_log2, int8_t *res);
+                      unsigned int fb_size_log2, uint8_t *res);
 
 void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec,
                          const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index af7e5a63b..90b0416fc 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -3408,23 +3408,12 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
   }
 #if CONFIG_CLPF
   cm->clpf_strength_y = cm->clpf_strength_u = cm->clpf_strength_v = 0;
-  cm->clpf_size = CLPF_64X64;
-
-  // Allocate buffer to hold the status of all filter blocks:
-  // 1 = On, 0 = off, -1 = implicitly off
-  {
-    int size;
-    cm->clpf_stride = ((cm->frame_to_show->y_crop_width + MIN_FB_SIZE - 1) &
-                       ~(MIN_FB_SIZE - 1)) >>
-                      MIN_FB_SIZE_LOG2;
-    size = cm->clpf_stride *
-               ((cm->frame_to_show->y_crop_height + MIN_FB_SIZE - 1) &
-                ~(MIN_FB_SIZE - 1)) >>
-           MIN_FB_SIZE_LOG2;
-    CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(size));
-    memset(cm->clpf_blocks, CLPF_NOFLAG, size);
-  }
-
+  cm->clpf_size = 2;
+  CHECK_MEM_ERROR(
+      cm, cm->clpf_blocks,
+      aom_malloc(((cm->frame_to_show->y_crop_width + 31) & ~31) *
+                     ((cm->frame_to_show->y_crop_height + 31) & ~31) >>
+                 10));
   if (!is_lossless_requested(&cpi->oxcf)) {
     const YV12_BUFFER_CONFIG *const frame = cm->frame_to_show;
 
@@ -3439,18 +3428,20 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) {
       // Apply the filter using the chosen strength
       cm->clpf_strength_y = strength_y - (strength_y == 4);
       cm->clpf_size =
-          fb_size_log2 ? fb_size_log2 - MAX_FB_SIZE_LOG2 + 3 : CLPF_NOSIZE;
-      av1_clpf_frame(frame, cpi->Source, cm, cm->clpf_size != CLPF_NOSIZE,
-                     strength_y, 4 + cm->clpf_size, AOM_PLANE_Y,
-                     av1_clpf_decision);
+          fb_size_log2 ? fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0;
+      cm->clpf_numblocks = av1_clpf_frame(
+          frame, cpi->Source, cm, !!cm->clpf_size, strength_y,
+          4 + cm->clpf_size, cm->clpf_blocks, AOM_PLANE_Y, av1_clpf_decision);
     }
     if (strength_u) {
       cm->clpf_strength_u = strength_u - (strength_u == 4);
-      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, AOM_PLANE_U, NULL);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_u, 4, NULL, AOM_PLANE_U,
+                     NULL);
     }
     if (strength_v) {
       cm->clpf_strength_v = strength_v - (strength_v == 4);
-      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, AOM_PLANE_V, NULL);
+      av1_clpf_frame(frame, NULL, cm, 0, strength_v, 4, NULL, AOM_PLANE_V,
+                     NULL);
     }
   }
 #endif