From: Steinar Midtskogen Date: Wed, 24 Aug 2016 11:00:04 +0000 (+0200) Subject: Reduce memory footprint for CLPF decoding. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e8224c7ad5747d1888ddb181839f205a1752afe0;p=libvpx Reduce memory footprint for CLPF decoding. Instead of having CLPF write to an entire new frame and copy the result back into the original frame, make the filter able to work in-place by keeping a buffer of size frame_width*filter_block_size and delay the write-back by one filter_block_size row. This reduces the cycles spent in the filter to ~75%. Change-Id: I78ca74380c45492daa8935d08d766851edb5fbc1 --- diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index a2b9a75d7..5f7384be7 100644 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl @@ -587,7 +587,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; if (aom_config("CONFIG_CLPF") eq "yes") { - add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int stride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; + add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, int width, int height, unsigned int strength"; specialize qw/aom_clpf_block sse2 ssse3 sse4_1 neon/; add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength"; specialize qw/aom_clpf_detect sse2 ssse3 sse4_1 neon/; diff --git a/av1/common/clpf.c b/av1/common/clpf.c index 799af0184..1ca60e056 100644 --- a/av1/common/clpf.c +++ b/av1/common/clpf.c @@ -27,30 +27,30 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b) { return (8 + delta - (delta < 0)) >> 4; } -void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int stride, int x0, - int y0, int sizex, int sizey, int width, int height, - unsigned int strength) { +void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + int width, int height, unsigned int strength) { int x, y; for (y = y0; y < y0 + sizey; y++) { for (x = x0; x < x0 + sizex; x++) { - int X = src[y * stride + x]; - int A = src[AOMMAX(0, y - 1) * stride + x]; - int B = src[y * stride + AOMMAX(0, x - 2)]; - int C = src[y * stride + AOMMAX(0, x - 1)]; - int D = src[y * stride + AOMMIN(width - 1, x + 1)]; - int E = src[y * stride + AOMMIN(width - 1, x + 2)]; - int F = src[AOMMIN(height - 1, y + 1) * stride + x]; + int X = src[y * sstride + x]; + int A = src[AOMMAX(0, y - 1) * sstride + x]; + int B = src[y * sstride + AOMMAX(0, x - 2)]; + int C = src[y * sstride + AOMMAX(0, x - 1)]; + int D = src[y * sstride + AOMMIN(width - 1, x + 1)]; + int E = src[y * sstride + AOMMIN(width - 1, x + 2)]; + int F = src[AOMMIN(height - 1, y + 1) * sstride + x]; int delta; delta = av1_clpf_sample(X, A, B, C, D, E, F, strength); - dst[y * stride + x] = X + delta; + dst[y * dstride + x] = X + delta; } } } // Return number of filtered blocks -int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, - int enable_fb_flag, unsigned int strength, +int av1_clpf_frame(const YV12_BUFFER_CONFIG *orig_dst, + const YV12_BUFFER_CONFIG *rec, const YV12_BUFFER_CONFIG *org, + AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, unsigned int fb_size_log2, uint8_t *blocks, int (*decision)(int, int, const YV12_BUFFER_CONFIG *, const YV12_BUFFER_CONFIG *, @@ -59,23 +59,45 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, /* Constrained low-pass filter (CLPF) */ int c, k, l, m, n; const int bs = MI_SIZE; - int width = rec->y_crop_width; - int height = rec->y_crop_height; + const int width = rec->y_crop_width; + const int height = rec->y_crop_height; int xpos, ypos; - int stride_y = rec->y_stride; - int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; - int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; + const int sstride = rec->y_stride; + int dstride = orig_dst->y_stride; + const int num_fb_hor = (width + (1 << fb_size_log2) - 1) >> fb_size_log2; + const int num_fb_ver = (height + (1 << fb_size_log2) - 1) >> fb_size_log2; int block_index = 0; + uint8_t *cache = NULL; + uint8_t **cache_ptr = NULL; + uint8_t **cache_dst = NULL; + int cache_idx = 0; + const int cache_size = num_fb_hor << (2 * fb_size_log2); + const int cache_blocks = cache_size / (bs * bs); + YV12_BUFFER_CONFIG dst = *orig_dst; + + // Make buffer space for in-place filtering + if (rec->y_buffer == dst.y_buffer) { + CHECK_MEM_ERROR(cm, cache, aom_malloc(cache_size)); + CHECK_MEM_ERROR(cm, cache_ptr, + aom_malloc(cache_blocks * sizeof(*cache_ptr))); + CHECK_MEM_ERROR(cm, cache_dst, + aom_malloc(cache_blocks * sizeof(*cache_dst))); + memset(cache_ptr, 0, cache_blocks * sizeof(*cache_dst)); + dst.y_buffer = cache; + dstride = bs; + } // Iterate over all filter blocks for (k = 0; k < num_fb_ver; k++) { for (l = 0; l < num_fb_hor; l++) { int h, w; int allskip = 1; + const int xoff = l << fb_size_log2; + const int yoff = k << fb_size_log2; for (m = 0; allskip && m < (1 << fb_size_log2) / bs; m++) { for (n = 0; allskip && n < (1 << fb_size_log2) / bs; n++) { - xpos = (l << fb_size_log2) + n * bs; - ypos = (k << fb_size_log2) + m * bs; + xpos = xoff + n * bs; + ypos = yoff + m * bs; if (xpos < width && ypos < height) { allskip &= cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] @@ -96,31 +118,57 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, // Iterate over all smaller blocks inside the filter block for (m = 0; m < (h + bs - 1) / bs; m++) { for (n = 0; n < (w + bs - 1) / bs; n++) { - xpos = (l << fb_size_log2) + n * bs; - ypos = (k << fb_size_log2) + m * bs; + xpos = xoff + n * bs; + ypos = yoff + m * bs; if (!cm->mi_grid_visible[ypos / bs * cm->mi_stride + xpos / bs] - ->mbmi.skip) { - // Not skip block, apply the filter - aom_clpf_block(rec->y_buffer, dst->y_buffer, stride_y, xpos, ypos, - bs, bs, width, height, strength); + ->mbmi.skip) { // Not skip block + // Temporary buffering needed if filtering in-place + if (cache) { + if (cache_ptr[cache_idx]) { + // Copy filtered block back into the frame + for (c = 0; c < bs; c++) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + } + cache_ptr[cache_idx] = cache + cache_idx * bs * bs; + dst.y_buffer = cache_ptr[cache_idx] - ypos * bs - xpos; + cache_dst[cache_idx] = rec->y_buffer + ypos * sstride + xpos; + if (++cache_idx >= cache_blocks) cache_idx = 0; + } + + // Apply the filter + aom_clpf_block(rec->y_buffer, dst.y_buffer, sstride, dstride, + xpos, ypos, bs, bs, width, height, strength); + } else { // Skip block, copy instead - for (c = 0; c < bs; c++) - *(uint64_t *)(dst->y_buffer + (ypos + c) * stride_y + xpos) = - *(uint64_t *)(rec->y_buffer + (ypos + c) * stride_y + xpos); + if (!cache) + for (c = 0; c < bs; c++) + *(uint64_t *)(dst.y_buffer + (ypos + c) * dstride + xpos) = *( + uint64_t *)(rec->y_buffer + (ypos + c) * sstride + xpos); } } } } else { // Entire filter block is skip, copy - for (m = 0; m < h; m++) - memcpy(dst->y_buffer + ((k << fb_size_log2) + m) * stride_y + - (l << fb_size_log2), - rec->y_buffer + ((k << fb_size_log2) + m) * stride_y + - (l << fb_size_log2), - w); + if (!cache) + for (m = 0; m < h; m++) + memcpy(dst.y_buffer + (yoff + m) * dstride + xoff, + rec->y_buffer + (yoff + m) * sstride + xoff, w); } block_index += !allskip; // Count number of blocks filtered } } + if (cache) { + // Copy remaining blocks into the frame + for (cache_idx = 0; cache_idx < cache_blocks && cache_ptr[cache_idx]; + cache_idx++) + for (c = 0; c < bs; c++) + *(uint64_t *)(cache_dst[cache_idx] + c * sstride) = + *(uint64_t *)(cache_ptr[cache_idx] + c * bs); + + aom_free(cache); + aom_free(cache_ptr); + } + return block_index; } diff --git a/av1/common/clpf.h b/av1/common/clpf.h index 21671a1c1..2fb12d6c6 100644 --- a/av1/common/clpf.h +++ b/av1/common/clpf.h @@ -18,7 +18,7 @@ int av1_clpf_maxbits(const AV1_COMMON *cm); int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int b); int av1_clpf_frame(const YV12_BUFFER_CONFIG *dst, const YV12_BUFFER_CONFIG *rec, - const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm, + const YV12_BUFFER_CONFIG *org, AV1_COMMON *cm, int enable_fb_flag, unsigned int strength, unsigned int fb_size_log2, uint8_t *blocks, int (*decision)(int, int, const YV12_BUFFER_CONFIG *, diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h index 0df6cd74e..544aa36f7 100644 --- a/av1/common/clpf_simd.h +++ b/av1/common/clpf_simd.h @@ -11,11 +11,11 @@ #include "./aom_dsp_rtcd.h" -static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, - int y0, int sizey, int width, int height, - unsigned int strength) { - dst += x0 + y0 * stride; - src += x0 + y0 * stride; +static void clpf_block(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizey, int width, + int height, unsigned int strength) { + dst += x0 + y0 * dstride; + src += x0 + y0 * sstride; { int bottom = height - 2 - y0; const v128 sp = v128_dup_8(strength); @@ -32,23 +32,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, for (y = 0; y < sizey; y += 2) { const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_shuffle_8(x, b_shuff); const v128 c = v128_shuffle_8(x, c_shuff); const v128 d = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + stride))); + v64_load_unaligned(src + 1 + sstride))); const v128 e = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + stride))); + v64_load_unaligned(src + 2 + sstride))); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -70,9 +70,9 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } else if (!(width - x0 - 8)) { // Clip right const v128 d_shuff = v128_from_v64(v64_from_64(0x0f0f0e0d0c0b0a09LL), @@ -83,23 +83,23 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, for (y = 0; y < sizey; y += 2) { const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + stride))); + v64_load_unaligned(src - 2 + sstride))); const v128 c = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + stride))); + v64_load_unaligned(src - 1 + sstride))); const v128 d = v128_shuffle_8(x, d_shuff); const v128 e = v128_shuffle_8(x, e_shuff); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -121,35 +121,35 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } else { // No left/right clipping int y; for (y = 0; y < sizey; y += 2) { const v64 l1 = v64_load_aligned(src); - const v64 l2 = v64_load_aligned(src + stride); + const v64 l2 = v64_load_aligned(src + sstride); v128 o = v128_from_v64(l1, l2); const v128 x = v128_add_8(c128, o); const v128 a = v128_add_8( c128, - v128_from_v64(v64_load_aligned(src - (y != -y0) * stride), l1)); + v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride), l1)); const v128 b = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 2), - v64_load_unaligned(src - 2 + stride))); + v64_load_unaligned(src - 2 + sstride))); const v128 c = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src - 1), - v64_load_unaligned(src - 1 + stride))); + v64_load_unaligned(src - 1 + sstride))); const v128 d = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 1), - v64_load_unaligned(src + 1 + stride))); + v64_load_unaligned(src + 1 + sstride))); const v128 e = v128_add_8( c128, v128_from_v64(v64_load_unaligned(src + 2), - v64_load_unaligned(src + 2 + stride))); + v64_load_unaligned(src + 2 + sstride))); const v128 f = v128_add_8( c128, v128_from_v64(l2, v64_load_aligned( - src + ((y != bottom) + 1) * stride))); + src + ((y != bottom) + 1) * sstride))); const v128 tmp = v128_add_8(v128_max_s8(v128_min_s8(v128_ssub_s8(c, x), sp), sm), @@ -171,17 +171,18 @@ static void clpf_block(const uint8_t *src, uint8_t *dst, int stride, int x0, delta, v128_zero()))), 4)); v64_store_aligned(dst, v128_high_v64(o)); - v64_store_aligned(dst + stride, v128_low_v64(o)); - src += stride * 2; - dst += stride * 2; + v64_store_aligned(dst + dstride, v128_low_v64(o)); + src += sstride * 2; + dst += dstride * 2; } } } } -void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, - int x0, int y0, int sizex, int sizey, int width, - int height, unsigned int strength) { +void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, + int sizey, int width, int height, + unsigned int strength) { // TODO(stemidts): // A sizex different from 8 will only be needed if CLPF is extended to chroma. // This will only be used if 4:2:0 and width not a multiple of 16 and along @@ -189,9 +190,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int stride, // this case. If not extended to chroma, this test will be redundant. if (sizex != 8 || width < 16 || y0 + 8 > height || x0 + 8 > width) { // Fallback to C for odd sizes - aom_clpf_block_c(src, dst, stride, x0, y0, sizex, sizey, width, height, - strength); + aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width, + height, strength); } else { - clpf_block(src, dst, stride, x0, y0, sizey, width, height, strength); + clpf_block(src, dst, sstride, dstride, x0, y0, sizey, width, height, + strength); } } diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index dc18944b3..6b2de8c59 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -3929,19 +3929,10 @@ void av1_decode_frame(AV1Decoder *pbi, const uint8_t *data, #if CONFIG_CLPF if (cm->clpf_strength && !cm->skip_loop_filter) { - YV12_BUFFER_CONFIG dst; // Buffer for the result - - dst = pbi->cur_buf->buf; - CHECK_MEM_ERROR(cm, dst.y_buffer, aom_malloc(dst.y_stride * dst.y_height)); - - av1_clpf_frame(&dst, &pbi->cur_buf->buf, 0, cm, !!cm->clpf_size, + const YV12_BUFFER_CONFIG *const frame = &pbi->cur_buf->buf; + av1_clpf_frame(frame, frame, 0, cm, !!cm->clpf_size, cm->clpf_strength + (cm->clpf_strength == 3), 4 + cm->clpf_size, cm->clpf_blocks, clpf_bit); - - // Copy result - memcpy(pbi->cur_buf->buf.y_buffer, dst.y_buffer, - dst.y_height * dst.y_stride); - aom_free(dst.y_buffer); } if (cm->clpf_blocks) aom_free(cm->clpf_blocks); #endif diff --git a/test/clpf_test.cc b/test/clpf_test.cc index 786180b6a..755d1f146 100644 --- a/test/clpf_test.cc +++ b/test/clpf_test.cc @@ -26,9 +26,9 @@ using libaom_test::ACMRandom; namespace { -typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int stride, - int x0, int y0, int sizex, int sizey, int width, - int height, unsigned int strength); +typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride, + int dstride, int x0, int y0, int sizex, int sizey, + int width, int height, unsigned int strength); typedef std::tr1::tuple clpf_block_param_t; @@ -85,10 +85,10 @@ TEST_P(ClpfBlockTest, TestSIMDNoMismatch) { for (ypos = 0; ypos < size && !error; ypos += h * !error) { for (xpos = 0; xpos < size && !error; xpos += w * !error) { for (strength = 0; strength < 3 && !error; strength += !error) { - ref_clpf(s, ref_d, size, xpos, ypos, w, h, size, size, + ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, size, size, 1 << strength); - ASM_REGISTER_STATE_CHECK( - clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength)); + ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, h, + size, size, 1 << strength)); for (pos = 0; pos < size * size && !error; pos++) { error = ref_d[pos] != d[pos]; @@ -137,7 +137,8 @@ TEST_P(ClpfSpeedTest, TestSpeed) { for (ypos = 0; ypos < size; ypos += h) { for (xpos = 0; xpos < size; xpos += w) { for (strength = 0; strength < 3; strength++) { - ref_clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength); + ref_clpf(s, d, size, size, xpos, ypos, w, h, size, size, + 1 << strength); } } } @@ -150,7 +151,7 @@ TEST_P(ClpfSpeedTest, TestSpeed) { for (ypos = 0; ypos < size; ypos += h) { for (xpos = 0; xpos < size; xpos += w) { for (strength = 0; strength < 3; strength++) { - clpf(s, d, size, xpos, ypos, w, h, size, size, 1 << strength); + clpf(s, d, size, size, xpos, ypos, w, h, size, size, 1 << strength); } } }