From: Steinar Midtskogen Date: Fri, 6 May 2016 11:48:20 +0000 (+0200) Subject: New CLPF: New kernel and RDO for strength and block size X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d06588ab18903f774f04d74ac0960b5c01166d77;p=libvpx New CLPF: New kernel and RDO for strength and block size This commit ports a CLPF change from aom/master by manually cherry-picking: 7560123c066854aa40c4685625454aea03410b18 Change-Id: I61eb08862a101df74a6b65ece459833401e81117 --- diff --git a/av1/av1_common.mk b/av1/av1_common.mk index 26ef9c5f9..eba37dcf2 100644 --- a/av1/av1_common.mk +++ b/av1/av1_common.mk @@ -86,8 +86,10 @@ ifeq (yes,$(filter $(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION),yes)) AV1_COMMON_SRCS-yes += common/warped_motion.h AV1_COMMON_SRCS-yes += common/warped_motion.c endif +ifeq ($(CONFIG_CLPF),yes) AV1_COMMON_SRCS-yes += common/clpf.c AV1_COMMON_SRCS-yes += common/clpf.h +endif ifeq ($(CONFIG_DERING),yes) AV1_COMMON_SRCS-yes += common/od_dering.c AV1_COMMON_SRCS-yes += common/od_dering.h diff --git a/av1/av1_cx.mk b/av1/av1_cx.mk index 2bb405cee..b3a812c38 100644 --- a/av1/av1_cx.mk +++ b/av1/av1_cx.mk @@ -101,6 +101,10 @@ AV1_CX_SRCS-yes += encoder/mbgraph.h ifeq ($(CONFIG_DERING),yes) AV1_CX_SRCS-yes += encoder/pickdering.c endif +ifeq ($(CONFIG_CLPF),yes) +AV1_CX_SRCS-yes += encoder/clpf_rdo.c +AV1_CX_SRCS-yes += encoder/clpf_rdo.h +endif AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm AV1_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c ifeq ($(CONFIG_AOM_HIGHBITDEPTH),yes) diff --git a/av1/common/clpf.c b/av1/common/clpf.c index 095c2a8eb..861dde6dc 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -9,96 +9,119 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include "av1/common/clpf.h" +#include "aom_dsp/aom_dsp_common.h" -// Apply the filter on a single block -static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride, - int dstride, int has_top, int has_left, int has_bottom, - int has_right, int width, int height) { - int x, y; +int av1_clpf_maxbits(const AV1_COMMON *cm) { + return get_msb( + ALIGN_POWER_OF_TWO(cm->mi_cols * MAX_MIB_SIZE, cm->clpf_size + 4) * + ALIGN_POWER_OF_TWO(cm->mi_rows * MAX_MIB_SIZE, + cm->clpf_size + 4) >> + (cm->clpf_size * 2 + 8)) + + 1; +} + +int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) { + int delta = 4 * clamp(A - X, -b, b) + clamp(B - X, -b, b) + + 3 * clamp(C - X, -b, b) + 3 * clamp(D - X, -b, b) + + clamp(E - X, -b, b) + 4 * clamp(F - X, -b, b); + return (8 + delta - (delta < 0)) >> 4; +} - for (y = 0; y < height; y++) { - for (x = 0; x < width; x++) { - int X = src[(y + 0) * sstride + x + 0]; - int A = has_top ? src[(y - 1) * sstride + x + 0] : X; - int B = has_left ? src[(y + 0) * sstride + x - 1] : X; - int C = has_right ? src[(y + 0) * sstride + x + 1] : X; - int D = has_bottom ? 
src[(y + 1) * sstride + x + 0] : X; - int delta = ((A > X) + (B > X) + (C > X) + (D > X) > 2) - - ((A < X) + (B < X) + (C < X) + (D < X) > 2); - dst[y * dstride + x] = X + delta; +static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, + int y0, int sizex, int sizey, int width, int height, + unsigned int strength) { + int x, y; + for (y = y0; y < y0 + sizey; y++) { + for (x = x0; x < x0 + sizex; x++) { + int X = src[y * stride + x]; + int A = src[AOMMAX(0, y - 1) * stride + x]; + int B = src[y * stride + AOMMAX(0, x - 2)]; + int C = src[y * stride + AOMMAX(0, x - 1)]; + int D = src[y * stride + AOMMIN(width - 1, x + 1)]; + int E = src[y * stride + AOMMIN(width - 1, x + 2)]; + int F = src[AOMMIN(height - 1, y + 1) * stride + x]; + int delta; + delta = av1_clpf_sample(X, A, B, C, D, E, F, strength); + dst[y * stride + x] = X + delta; } } } -#define BS (MI_SIZE * MAX_MIB_SIZE) - -// Iterate over blocks within a superblock -static void av1_clpf_sb(const YV12_BUFFER_CONFIG *frame_buffer, - const AV1_COMMON *cm, MACROBLOCKD *xd, - MODE_INFO *const *mi_8x8, int xpos, int ypos) { - // Temporary buffer (to allow SIMD parallelism) - uint8_t buf_unaligned[BS * BS + 15]; - uint8_t *buf = (uint8_t *)(((intptr_t)buf_unaligned + 15) & ~15); - int x, y, p; +// Return number of filtered blocks +int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + int enable_fb_flag, unsigned int strength, + unsigned int fb_size_log2, uint8_t *blocks, + int (*decision)(int, int, const YV12_BUFFER_CONFIG *, + const YV12_BUFFER_CONFIG *, + const AV1_COMMON *cm, int, int, int, + unsigned int, unsigned int, uint8_t *)) { + /* Constrained low-pass filter (CLPF) */ + int c, k, l, m, n; + int width = rec->y_crop_width; + int height = rec->y_crop_height; + int xpos, ypos; + int stride_y = rec->y_stride; + int stride_c = rec->uv_stride; + const int bs = MAX_MIB_SIZE; + int num_fb_hor = (width + (1 << 
fb_size_log2) - bs) >> fb_size_log2; + int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2; + int block_index = 0; - for (p = 0; p < (CLPF_FILTER_ALL_PLANES ? MAX_MB_PLANE : 1); p++) { - for (y = 0; y < MAX_MIB_SIZE && ypos + y < cm->mi_rows; y++) { - for (x = 0; x < MAX_MIB_SIZE && xpos + x < cm->mi_cols; x++) { - const MB_MODE_INFO *mbmi = - &mi_8x8[(ypos + y) * cm->mi_stride + xpos + x]->mbmi; - - // Do not filter if there is no residual - if (!mbmi->skip) { - // Do not filter frame edges - int has_top = ypos + y > 0; - int has_left = xpos + x > 0; - int has_bottom = ypos + y < cm->mi_rows - 1; - int has_right = xpos + x < cm->mi_cols - 1; -#if CLPF_ALLOW_BLOCK_PARALLELISM - // Do not filter superblock edges - has_top &= !!y; - has_left &= !!x; - has_bottom &= y != MAX_MIB_SIZE - 1; - has_right &= x != MAX_MIB_SIZE - 1; -#endif - av1_setup_dst_planes(xd->plane, frame_buffer, ypos + y, xpos + x); - clpf_block( - xd->plane[p].dst.buf, CLPF_ALLOW_PIXEL_PARALLELISM - ? buf + y * MI_SIZE * BS + x * MI_SIZE - : xd->plane[p].dst.buf, - xd->plane[p].dst.stride, - CLPF_ALLOW_PIXEL_PARALLELISM ? 
BS : xd->plane[p].dst.stride, - has_top, has_left, has_bottom, has_right, - MI_SIZE >> xd->plane[p].subsampling_x, - MI_SIZE >> xd->plane[p].subsampling_y); + // Iterate over all filter blocks + for (k = 0; k < num_fb_ver; k++) { + for (l = 0; l < num_fb_hor; l++) { + int h, w; + int allskip = 1; + for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) { + for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) { + xpos = (l << fb_size_log2) + n * bs; + ypos = (k << fb_size_log2) + m * bs; + if (xpos < width && ypos < height) { + allskip &= + cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] + ->mbmi.skip; + } } } - } -#if CLPF_ALLOW_PIXEL_PARALLELISM - for (y = 0; y < MAX_MIB_SIZE && ypos + y < cm->mi_rows; y++) { - for (x = 0; x < MAX_MIB_SIZE && xpos + x < cm->mi_cols; x++) { - const MB_MODE_INFO *mbmi = - &mi_8x8[(ypos + y) * cm->mi_stride + xpos + x]->mbmi; - av1_setup_dst_planes(xd->plane, frame_buffer, ypos + y, xpos + x); - if (!mbmi->skip) { - int i = 0; - for (i = 0; i<MI_SIZE>> xd->plane[p].subsampling_y; i++) - memcpy(xd->plane[p].dst.buf + i * xd->plane[p].dst.stride, - buf + (y * MI_SIZE + i) * BS + x * MI_SIZE, - MI_SIZE >> xd->plane[p].subsampling_x); + + // Calculate the actual filter block size near frame edges + h = AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); + w = AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); + h += !h << fb_size_log2; + w += !w << fb_size_log2; + if (!allskip && // Do not filter the block if all is skip encoded + (!enable_fb_flag || + decision(k, l, rec, org, cm, bs, w / bs, h / bs, strength, + fb_size_log2, blocks + block_index))) { + // Iterate over all smaller blocks inside the filter block + for (m = 0; m < (h + bs - 1) / bs; m++) { + for (n = 0; n < (w + bs - 1) / bs; n++) { + xpos = (l << fb_size_log2) + n * bs; + ypos = (k << fb_size_log2) + m * bs; + if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] + ->mbmi.skip) { + // Not skip block, apply the filter + 
clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos, bs, + bs, width, height, strength); + } else { // Skip block, copy instead + for (c = 0; c < bs; c++) + *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) = + *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos); + } + } } + } else { // Entire filter block is skip, copy + for (m = 0; m < h; m++) + memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y + + (l << fb_size_log2), + rec->y_buffer + ((k << fb_size_log2) + m) * stride_y + + (l << fb_size_log2), + w); } + block_index += !allskip; // Count number of blocks filtered } -#endif } -} - -// Iterate over the superblocks of an entire frame -void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, - MACROBLOCKD *xd) { - int x, y; - for (y = 0; y < cm->mi_rows; y += MAX_MIB_SIZE) - for (x = 0; x < cm->mi_cols; x += MAX_MIB_SIZE) - av1_clpf_sb(frame, cm, xd, cm->mi_grid_visible, x, y); + return block_index; } diff --git a/av1/common/clpf.h b/av1/common/clpf.h index 0b352f65b..21671a1c1 100644 --- a/av1/common/clpf.h +++ b/av1/common/clpf.h @@ -13,15 +13,17 @@ #include "av1/common/reconinter.h" -// Configuration -#define CLPF_ALLOW_PIXEL_PARALLELISM \ - 1 // 1 = SIMD friendly (adds a buffer requirement) -#define CLPF_ALLOW_BLOCK_PARALLELISM \ - 0 // 1 = MT friendly (degrades quality slighty) -#define CLPF_FILTER_ALL_PLANES \ - 0 // 1 = filter both luma and chroma, 0 = filter only luma +#define MAX_FB_SIZE 128 -void av1_clpf_frame(const YV12_BUFFER_CONFIG *frame, const AV1_COMMON *cm, - MACROBLOCKD *xd); +int av1_clpf_maxbits(const AV1_COMMON *cm); +int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b); +int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + int enable_fb_flag, unsigned int strength, + unsigned int fb_size_log2, uint8_t *blocks, + int (*decision)(int, int, const YV12_BUFFER_CONFIG *, + const 
YV12_BUFFER_CONFIG *, + const AV1_COMMON *cm, int, int, int, + unsigned int, unsigned int, uint8_t *)); #endif diff --git a/av1/common/onyxc_int.h b/av1/common/onyxc_int.h index 62b2c7ad3..2bb680a20 100644 --- a/av1/common/onyxc_int.h +++ b/av1/common/onyxc_int.h @@ -151,7 +151,10 @@ typedef struct AV1Common { int use_highbitdepth; #endif #if CONFIG_CLPF - int clpf; + int clpf_numblocks; + int clpf_size; + int clpf_strength; + uint8_t *clpf_blocks; #endif YV12_BUFFER_CONFIG *frame_to_show; diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index a3c751c34..51ef4758d 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -2044,7 +2044,26 @@ static void setup_loopfilter(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { #if CONFIG_CLPF static void setup_clpf(AV1_COMMON *cm, struct aom_read_bit_buffer *rb) { - cm->clpf = aom_rb_read_literal(rb, 1); + cm->clpf_blocks = 0; + cm->clpf_strength = aom_rb_read_literal(rb, 2); + if (cm->clpf_strength) { + cm->clpf_size = aom_rb_read_literal(rb, 2); + if (cm->clpf_size) { + int i; + cm->clpf_numblocks = aom_rb_read_literal(rb, av1_clpf_maxbits(cm)); + CHECK_MEM_ERROR(cm, cm->clpf_blocks, aom_malloc(cm->clpf_numblocks)); + for (i = 0; i < cm->clpf_numblocks; i++) { + cm->clpf_blocks[i] = aom_rb_read_literal(rb, 1); + } + } + } +} + +static int clpf_bit(int k, int l, const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + int block_size, int w, int h, unsigned int strength, + unsigned int fb_size_log2, uint8_t *bit) { + return *bit; } #endif @@ -3906,8 +3925,22 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data, #endif // CONFIG_LOOP_RESTORATION #if CONFIG_CLPF - if (cm->clpf && !cm->skip_loop_filter) - av1_clpf_frame(&pbi->cur_buf->buf, cm, &pbi->mb); + if (cm->clpf_strength && !cm->skip_loop_filter) { + YV12_BUFFER_CONFIG dst; // Buffer for the result + + dst = pbi->cur_buf->buf; + CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * 
dst.y_height)); + + av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size, + cm->clpf_strength + (cm->clpf_strength == 3), + 4 + cm->clpf_size, cm->clpf_blocks, clpf_bit); + + // Copy result + memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer, + dst.y_height * dst.y_stride); + aom_free(dst.y_buffer); + } + if (cm->clpf_blocks) aom_free(cm->clpf_blocks); #endif #if CONFIG_DERING if (cm->dering_level && !cm->skip_loop_filter) { diff --git a/av1/encoder/bitstream.c b/av1/encoder/bitstream.c index a01baa253..bd3388110 100644 --- a/av1/encoder/bitstream.c +++ b/av1/encoder/bitstream.c @@ -2590,7 +2590,22 @@ static void encode_loopfilter(AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { #if CONFIG_CLPF static void encode_clpf(const AV1_COMMON *cm, struct aom_write_bit_buffer *wb) { - aom_wb_write_literal(wb, cm->clpf, 1); + aom_wb_write_literal(wb, cm->clpf_strength, 2); + if (cm->clpf_strength) { + aom_wb_write_literal(wb, cm->clpf_size, 2); + if (cm->clpf_size) { + int i; + // TODO(stemidts): The number of bits to transmit could be + // implicitly deduced if transmitted after the filter block or + // after the frame (when it's known whether the block is all + // skip and implicitly unfiltered). And the bits do not have + // 50% probability, so a more efficient coding is possible. + aom_wb_write_literal(wb, cm->clpf_numblocks, av1_clpf_maxbits(cm)); + for (i = 0; i < cm->clpf_numblocks; i++) { + aom_wb_write_literal(wb, cm->clpf_blocks[i], 1); + } + } + } } #endif diff --git a/av1/encoder/clpf_rdo.c b/av1/encoder/clpf_rdo.c new file mode 100644 index 000000000..7710de41a --- /dev/null +++ b/av1/encoder/clpf_rdo.c @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. 
If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include "av1/common/clpf.h" +#include "aom/aom_integer.h" +#include "av1/common/quant_common.h" + +// Calculate the error of a filtered and unfiltered block +static void detect_clpf(const uint8_t *rec, const uint8_t *org, int x0, int y0, + int width, int height, int so, int stride, int *sum0, + int *sum1, unsigned int strength) { + int x, y; + for (y = y0; y < y0 + 8; y++) { + for (x = x0; x < x0 + 8; x++) { + int O = org[y * so + x]; + int X = rec[y * stride + x]; + int A = rec[AOMMAX(0, y - 1) * stride + x]; + int B = rec[y * stride + AOMMAX(0, x - 2)]; + int C = rec[y * stride + AOMMAX(0, x - 1)]; + int D = rec[y * stride + AOMMIN(width - 1, x + 1)]; + int E = rec[y * stride + AOMMIN(width - 1, x + 2)]; + int F = rec[AOMMIN(height - 1, y + 1) * stride + x]; + int delta = av1_clpf_sample(X, A, B, C, D, E, F, strength); + int Y = X + delta; + *sum0 += (O - X) * (O - X); + *sum1 += (O - Y) * (O - Y); + } + } +} + +static void detect_multi_clpf(const uint8_t *rec, const uint8_t *org, int x0, + int y0, int width, int height, int so, int stride, + int *sum) { + int x, y; + + for (y = y0; y < y0 + 8; y++) { + for (x = x0; x < x0 + 8; x++) { + int O = org[y * so + x]; + int X = rec[y * stride + x]; + int A = rec[AOMMAX(0, y - 1) * stride + x]; + int B = rec[y * stride + AOMMAX(0, x - 2)]; + int C = rec[y * stride + AOMMAX(0, x - 1)]; + int D = rec[y * stride + AOMMIN(width - 1, x + 1)]; + int E = rec[y * stride + AOMMIN(width - 1, x + 2)]; + int F = rec[AOMMIN(height - 1, y + 1) * stride + x]; + int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, 1); + int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, 2); + int delta3 = av1_clpf_sample(X, A, 
B, C, D, E, F, 4); + int F1 = X + delta1; + int F2 = X + delta2; + int F3 = X + delta3; + sum[0] += (O - X) * (O - X); + sum[1] += (O - F1) * (O - F1); + sum[2] += (O - F2) * (O - F2); + sum[3] += (O - F3) * (O - F3); + } + } +} + +int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + int block_size, int w, int h, unsigned int strength, + unsigned int fb_size_log2, uint8_t *res) { + int m, n, sum0 = 0, sum1 = 0; + for (m = 0; m < h; m++) { + for (n = 0; n < w; n++) { + int xpos = (l << fb_size_log2) + n * block_size; + int ypos = (k << fb_size_log2) + m * block_size; + const int bs = MAX_MIB_SIZE; + if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] + ->mbmi.skip) + detect_clpf(rec->y_buffer, org->y_buffer, xpos, ypos, rec->y_crop_width, + rec->y_crop_height, org->y_stride, rec->y_stride, &sum0, + &sum1, strength); + } + } + *res = sum1 < sum0; + return *res; +} + +// Calculate the square error of all filter settings. 
Result: +// res[0][0] : unfiltered +// res[0][1-3] : strength=1,2,4, no signals +// res[1][0] : (bit count, fb size = 128) +// res[1][1-3] : strength=1,2,4, fb size = 128 +// res[2][0] : (bit count, fb size = 64) +// res[2][1-3] : strength=1,2,4, fb size = 64 +// res[3][0] : (bit count, fb size = 32) +// res[3][1-3] : strength=1,2,4, fb size = 32 +static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + unsigned int block_size, unsigned int fb_size_log2, int w, + int h, int64_t res[4][4]) { + int i, m, n, filtered = 0; + int sum[4]; + int bslog = get_msb(block_size); + sum[0] = sum[1] = sum[2] = sum[3] = 0; + if (fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) { + int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered; + + fb_size_log2--; + w1 = AOMMIN(1 << (fb_size_log2 - bslog), w); + h1 = AOMMIN(1 << (fb_size_log2 - bslog), h); + w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1); + h2 = AOMMIN(h - (1 << (fb_size_log2 - bslog)), h >> 1); + i = get_msb(MAX_FB_SIZE) - fb_size_log2; + sum1 = res[i][1]; + sum2 = res[i][2]; + sum3 = res[i][3]; + oldfiltered = res[i][0]; + res[i][0] = 0; + + filtered = + clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1, res); + if (1 << (fb_size_log2 - bslog) < w) + filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size, + fb_size_log2, w2, h1, res); + if (1 << (fb_size_log2 - bslog) < h) { + filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size, + fb_size_log2, w1, h2, res); + filtered |= clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), + rec, org, cm, block_size, fb_size_log2, w2, h2, res); + } + + res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]); + res[i][2] = AOMMIN(sum2 + res[i][0], res[i][2]); + res[i][3] = AOMMIN(sum3 + res[i][0], res[i][3]); + res[i][0] = oldfiltered + filtered; // Number of signal bits + return filtered; + } + + for (m = 0; m < h; m++) { + for (n = 0; n < w; n++) { + int xpos 
= x + n * block_size; + int ypos = y + m * block_size; + if (!cm->mi_grid_visible[ypos / MAX_MIB_SIZE * cm->mi_stride + + xpos / MAX_MIB_SIZE] + ->mbmi.skip) { + detect_multi_clpf(rec->y_buffer, org->y_buffer, xpos, ypos, + rec->y_crop_width, rec->y_crop_height, org->y_stride, + rec->y_stride, sum); + filtered = 1; + } + } + } + + for (i = 0; i < 4; i++) { + res[i][0] += sum[0]; + res[i][1] += sum[1]; + res[i][2] += sum[2]; + res[i][3] += sum[3]; + } + return filtered; +} + +void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + int *best_strength, int *best_bs) { + int i, j, k, l; + int64_t best, sums[4][4]; + int width = rec->y_crop_width, height = rec->y_crop_height; + const int bs = MAX_MIB_SIZE; + int fb_size_log2 = get_msb(MAX_FB_SIZE); + int num_fb_ver = (height + (1 << fb_size_log2) - bs) >> fb_size_log2; + int num_fb_hor = (width + (1 << fb_size_log2) - bs) >> fb_size_log2; + + memset(sums, 0, sizeof(sums)); + + for (k = 0; k < num_fb_ver; k++) { + for (l = 0; l < num_fb_hor; l++) { + // Calculate the block size after frame border clipping + int h = + AOMMIN(height, (k + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); + int w = + AOMMIN(width, (l + 1) << fb_size_log2) & ((1 << fb_size_log2) - 1); + h += !h << fb_size_log2; + w += !w << fb_size_log2; + clpf_rdo(k << fb_size_log2, l << fb_size_log2, rec, org, cm, bs, + fb_size_log2, w / bs, h / bs, sums); + } + } + + for (j = 0; j < 4; j++) { + static const double lambda_square[] = { + // exp((i - 15.4244) / 8.4010) + 0.159451, 0.179607, 0.202310, 0.227884, 0.256690, 0.289138, 0.325687, + 0.366856, 0.413230, 0.465465, 0.524303, 0.590579, 0.665233, 0.749323, + 0.844044, 0.950737, 1.070917, 1.206289, 1.358774, 1.530533, 1.724004, + 1.941931, 2.187406, 2.463911, 2.775368, 3.126195, 3.521370, 3.966498, + 4.467893, 5.032669, 5.668837, 6.385421, 7.192586, 8.101784, 9.125911, + 10.27949, 11.57890, 13.04256, 14.69124, 16.54832, 18.64016, 20.99641, + 
23.65052, 26.64013, 30.00764, 33.80084, 38.07352, 42.88630, 48.30746, + 54.41389, 61.29221, 69.04002, 77.76720, 87.59756, 98.67056, 111.1432, + 125.1926, 141.0179, 158.8436, 178.9227, 201.5399, 227.0160, 255.7126, + 288.0366 + }; + + // Estimate the bit costs and adjust the square errors + double lambda = + lambda_square[av1_get_qindex(&cm->seg, 0, cm->base_qindex) >> 2]; + int i, cost = (int)((1.2 * lambda * (sums[j][0] + 2 + 2 * (j > 0)) + 0.5)); + for (i = 0; i < 4; i++) + sums[j][i] = ((sums[j][i] + (i && j) * cost) << 4) + j * 4 + i; + } + + best = (int64_t)1 << 62; + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) + if ((!i || j) && sums[i][j] < best) best = sums[i][j]; + best &= 15; + *best_bs = (best > 3) * (5 + (best < 12) + (best < 8)); + *best_strength = best ? 1 << ((best - 1) & 3) : 0; +} diff --git a/av1/encoder/clpf_rdo.h b/av1/encoder/clpf_rdo.h new file mode 100644 index 000000000..3dd5478fc --- /dev/null +++ b/av1/encoder/clpf_rdo.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2016, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#ifndef AV1_ENCODER_CLPF_H_ +#define AV1_ENCODER_CLPF_H_ + +#include "av1/common/reconinter.h" + +int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + int block_size, int w, int h, unsigned int strength, + unsigned int fb_size_log2, uint8_t *res); + +void av1_clpf_test_frame(const YV12_BUFFER_CONFIG *rec, + const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + int *best_strength, int *best_bs); + +#endif diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c index 935d65ea5..20cabb679 100644 --- a/av1/encoder/encoder.c +++ b/av1/encoder/encoder.c @@ -17,6 +17,7 @@ #include "av1/common/alloccommon.h" #if CONFIG_CLPF #include "av1/common/clpf.h" +#include "av1/encoder/clpf_rdo.h" #endif #if CONFIG_DERING #include "av1/common/dering.h" @@ -3420,57 +3421,56 @@ static void loopfilter_frame(AV1_COMP *cpi, AV1_COMMON *cm) { av1_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); #endif } -#if CONFIG_DERING - if (is_lossless_requested(&cpi->oxcf)) { - cm->dering_level = 0; - } else { - cm->dering_level = - av1_dering_search(cm->frame_to_show, cpi->Source, cm, xd); - av1_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level); - } -#endif // CONFIG_DERING - #if CONFIG_CLPF - cm->clpf = 0; + cm->clpf_strength = 0; + cm->clpf_size = 2; + CHECK_MEM_ERROR( + cm, cm->clpf_blocks, + aom_malloc(((cm->frame_to_show->y_crop_width + 31) & ~31) * + ((cm->frame_to_show->y_crop_height + 31) & ~31) >> + 10)); if (!is_lossless_requested(&cpi->oxcf)) { // Test CLPF int i, hq = 1; - uint64_t before, after; // TODO(yaowu): investigate per-segment CLPF decision and // an optimal threshold, use 80 for now. 
for (i = 0; i < MAX_SEGMENTS; i++) hq &= av1_get_qindex(&cm->seg, i, cm->base_qindex) < 80; - if (!hq) { // Don't try filter if the entire image is nearly losslessly - // encoded -#if CLPF_FILTER_ALL_PLANES - aom_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf); - before = aom_get_y_sse(cpi->Source, cm->frame_to_show) + - aom_get_u_sse(cpi->Source, cm->frame_to_show) + - aom_get_v_sse(cpi->Source, cm->frame_to_show); - av1_clpf_frame(cm->frame_to_show, cm, xd); - after = aom_get_y_sse(cpi->Source, cm->frame_to_show) + - aom_get_u_sse(cpi->Source, cm->frame_to_show) + - aom_get_v_sse(cpi->Source, cm->frame_to_show); -#else - aom_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); - before = aom_get_y_sse(cpi->Source, cm->frame_to_show); - av1_clpf_frame(cm->frame_to_show, cm, xd); - after = aom_get_y_sse(cpi->Source, cm->frame_to_show); -#endif - if (before < after) { -// No improvement, restore original -#if CLPF_FILTER_ALL_PLANES - aom_yv12_copy_frame(&cpi->last_frame_uf, cm->frame_to_show); -#else - aom_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); -#endif + // Don't try filter if the entire image is nearly losslessly encoded + if (!hq) { + // Find the best strength and block size for the entire frame + int fb_size_log2, strength; + av1_clpf_test_frame(&cpi->last_frame_uf, cpi->Source, cm, &strength, + &fb_size_log2); + + if (!fb_size_log2) fb_size_log2 = get_msb(MAX_FB_SIZE); + + if (!strength) { // Better to disable for the whole frame? + cm->clpf_strength = 0; } else { - cm->clpf = 1; + // Apply the filter using the chosen strength + cm->clpf_strength = strength - (strength == 4); + cm->clpf_size = + fb_size_log2 ? 
fb_size_log2 - get_msb(MAX_FB_SIZE) + 3 : 0; + aom_yv12_copy_frame(cm->frame_to_show, &cpi->last_frame_uf); + cm->clpf_numblocks = + av1_clpf_frame(cm->frame_to_show, &cpi->last_frame_uf, cpi->Source, + cm, !!cm->clpf_size, strength, 4 + cm->clpf_size, + cm->clpf_blocks, av1_clpf_decision); } } } #endif +#if CONFIG_DERING + if (is_lossless_requested(&cpi->oxcf)) { + cm->dering_level = 0; + } else { + cm->dering_level = + av1_dering_search(cm->frame_to_show, cpi->Source, cm, xd); + av1_dering_frame(cm->frame_to_show, cm, xd, cm->dering_level); + } +#endif // CONFIG_DERING #if CONFIG_LOOP_RESTORATION if (cm->rst_info.restoration_type != RESTORE_NONE) { av1_loop_restoration_init(&cm->rst_internal, &cm->rst_info, @@ -4730,6 +4730,10 @@ static void encode_frame_to_data_rate(AV1_COMP *cpi, size_t *size, if (cm->show_frame) dump_filtered_recon_frames(cpi); #endif // DUMP_RECON_FRAMES +#if CONFIG_CLPF + aom_free(cm->clpf_blocks); +#endif + if (cm->seg.update_map) update_reference_segmentation_map(cpi); if (frame_is_intra_only(cm) == 0) {