From: Steinar Midtskogen
Date: Mon, 26 Sep 2016 10:51:25 +0000 (+0200)
Subject: Clean up and speed up CLPF clipping
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e66fc87c463825d69a12e88e6b7f92754eac2311;p=libvpx

Clean up and speed up CLPF clipping

* Move clipping tests from inside to outside the loops
* Let the sizex and sizey arguments to clpf_block() be the clipped block
  size rather than just bs for both
* Make the fallback tests to C more accurate

Change-Id: Icdc57540ce21b41a95403fdcc37988a4ebf546c7
---
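The diff below boils down to two patterns. First, the caller clips the
nominal block size once, before the copy and filter loops (sizex =
AOMMIN(width - xpos, bs), sizey = AOMMIN(height - ypos, bs)), and then
dispatches on the clipped width instead of re-testing positions inside
every row. Second, the SIMD wrappers fall back to C based on the clipped
size alone rather than on the block position. The snippet below is only a
minimal illustration of the first pattern, not library code:
copy_block_back and copy_clipped are made-up names, CLPF_MIN stands in for
the AOMMIN macro, and plain memcpy stands in for the 64-/32-bit stores the
real code uses.

#include <stdint.h>
#include <string.h>

/* Stand-in for the AOMMIN() macro used in clpf.c. */
#define CLPF_MIN(a, b) ((a) < (b) ? (a) : (b))

/* Hypothetical helper mirroring the new copy-back structure: the width
   test is hoisted out of the per-row loop; memcpy of 8 or 4 bytes stands
   in for the wide stores used in the patch. */
static void copy_block_back(uint8_t *dst, int dstride, const uint8_t *src,
                            int sstride, int sizex, int sizey) {
  int c;
  if (sizex == 8) {
    for (c = 0; c < sizey; c++)
      memcpy(dst + c * dstride, src + c * sstride, 8);
  } else if (sizex == 4) {
    for (c = 0; c < sizey; c++)
      memcpy(dst + c * dstride, src + c * sstride, 4);
  } else { /* partial block at the frame edge */
    for (c = 0; c < sizey; c++)
      memcpy(dst + c * dstride, src + c * sstride, sizex);
  }
}

/* Caller side: clip the nominal block size bs against the frame bounds
   once, outside the loops, the way the patch computes sizex and sizey. */
void copy_clipped(uint8_t *dst, int dstride, const uint8_t *src, int sstride,
                  int xpos, int ypos, int bs, int width, int height) {
  const int sizex = CLPF_MIN(width - xpos, bs);
  const int sizey = CLPF_MIN(height - ypos, bs);
  copy_block_back(dst + ypos * dstride + xpos, dstride,
                  src + ypos * sstride + xpos, sstride, sizex, sizey);
}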
diff --git a/av1/common/clpf.c b/av1/common/clpf.c
index 9eef2b50a..1cf52724d 100644
--- a/av1/common/clpf.c
+++ b/av1/common/clpf.c
@@ -153,8 +153,11 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
         // Iterate over all smaller blocks inside the filter block
         for (m = 0; m < ((h + bs - 1) >> bslog); m++) {
           for (n = 0; n < ((w + bs - 1) >> bslog); n++) {
+            int sizex, sizey;
             xpos = xoff + n * bs;
             ypos = yoff + m * bs;
+            sizex = AOMMIN(width - xpos, bs);
+            sizey = AOMMIN(height - ypos, bs);
             if (!cm->mi_grid_visible[(ypos << suby) / MI_SIZE * cm->mi_stride +
                                      (xpos << subx) / MI_SIZE]
                      ->mbmi.skip) {  // Not skip block
@@ -164,30 +167,49 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
 #if CONFIG_AOM_HIGHBITDEPTH
                 if (cm->use_highbitdepth) {
                   uint16_t *const d = CONVERT_TO_SHORTPTR(cache_dst[cache_idx]);
-                  for (c = 0; c < bs; c++) {
-                    *(uint64_t *)(d + c * sstride) =
-                        *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
-                    if (bs == 8)
+                  if (sizex == 8) {
+                    for (c = 0; c < sizey; c++) {
+                      *(uint64_t *)(d + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
                       *(uint64_t *)(d + c * sstride + 4) =
                           *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2 + 8);
+                    }
+                  } else if (sizex == 4) {
+                    for (c = 0; c < sizey; c++)
+                      *(uint64_t *)(d + c * sstride) =
+                          *(uint64_t *)(cache_ptr[cache_idx] + c * bs * 2);
+                  } else {
+                    for (c = 0; c < sizey; c++)
+                      memcpy(d + c * sstride, cache_ptr[cache_idx] + c * bs * 2,
+                             sizex);
                   }
                 } else {
-                  for (c = 0; c < bs; c++)
-                    if (bs == 8)
+                  if (sizex == 8)
+                    for (c = 0; c < sizey; c++)
                       *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                           *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-                    else
+                  else if (sizex == 4)
+                    for (c = 0; c < sizey; c++)
                       *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                           *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+                  else
+                    for (c = 0; c < sizey; c++)
+                      memcpy(cache_dst[cache_idx] + c * sstride,
+                             cache_ptr[cache_idx] + c * bs, sizex);
                 }
 #else
-                for (c = 0; c < bs; c++)
-                  if (bs == 8)
+                if (sizex == 8)
+                  for (c = 0; c < sizey; c++)
                     *(uint64_t *)(cache_dst[cache_idx] + c * sstride) =
                         *(uint64_t *)(cache_ptr[cache_idx] + c * bs);
-                  else
+                else if (sizex == 4)
+                  for (c = 0; c < sizey; c++)
                     *(uint32_t *)(cache_dst[cache_idx] + c * sstride) =
                         *(uint32_t *)(cache_ptr[cache_idx] + c * bs);
+                else
+                  for (c = 0; c < sizey; c++)
+                    memcpy(cache_dst[cache_idx] + c * sstride,
+                           cache_ptr[cache_idx] + c * bs, sizex);
 #endif
               }
 #if CONFIG_AOM_HIGHBITDEPTH
@@ -211,15 +233,15 @@ int av1_clpf_frame(const YV12_BUFFER_CONFIG *frame,
               if (cm->use_highbitdepth) {
                 aom_clpf_block_hbd(CONVERT_TO_SHORTPTR(src_buffer),
                                    CONVERT_TO_SHORTPTR(dst_buffer), sstride,
-                                   dstride, xpos, ypos, bs, bs, width, height,
-                                   strength);
+                                   dstride, xpos, ypos, sizex, sizey, width,
+                                   height, strength);
               } else {
                 aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                               ypos, bs, bs, width, height, strength);
+                               ypos, sizex, sizey, width, height, strength);
               }
 #else
               aom_clpf_block(src_buffer, dst_buffer, sstride, dstride, xpos,
-                             ypos, bs, bs, width, height, strength);
+                             ypos, sizex, sizey, width, height, strength);
 #endif
             }
           }
diff --git a/av1/common/clpf_simd.h b/av1/common/clpf_simd.h
index 979856b49..6fef4b7b5 100644
--- a/av1/common/clpf_simd.h
+++ b/av1/common/clpf_simd.h
@@ -76,24 +76,27 @@ static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
     v128 o = v128_from_v64(l1, l2);
     const v128 a = v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride),
                                  l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
 
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -134,31 +137,34 @@ static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
     const uint32_t l5 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
     v128 o = v128_from_32(l1, l2, l3, l4);
     const v128 a = v128_from_32(l0, l1, l2, l3);
-    v128 b = v128_from_32(u32_load_unaligned(src - 2 * !!x0),
-                          u32_load_unaligned(src + sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 2 * sstride - 2 * !!x0),
-                          u32_load_unaligned(src + 3 * sstride - 2 * !!x0));
-    v128 c = v128_from_32(u32_load_unaligned(src - !!x0),
-                          u32_load_unaligned(src + sstride - !!x0),
-                          u32_load_unaligned(src + 2 * sstride - !!x0),
-                          u32_load_unaligned(src + 3 * sstride - !!x0));
-    v128 d = v128_from_32(u32_load_unaligned(src + !!right),
-                          u32_load_unaligned(src + sstride + !!right),
-                          u32_load_unaligned(src + 2 * sstride + !!right),
-                          u32_load_unaligned(src + 3 * sstride + !!right));
-    v128 e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
-                          u32_load_unaligned(src + sstride + 2 * !!right),
-                          u32_load_unaligned(src + 2 * sstride + 2 * !!right),
-                          u32_load_unaligned(src + 3 * sstride + 2 * !!right));
     const v128 f = v128_from_32(l2, l3, l4, l5);
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_32(u32_load_unaligned(src - 2),
+                       u32_load_unaligned(src + sstride - 2),
+                       u32_load_unaligned(src + 2 * sstride - 2),
+                       u32_load_unaligned(src + 3 * sstride - 2));
+      c = v128_from_32(u32_load_unaligned(src - 1),
+                       u32_load_unaligned(src + sstride - 1),
+                       u32_load_unaligned(src + 2 * sstride - 1),
+                       u32_load_unaligned(src + 3 * sstride - 1));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_32(u32_load_unaligned(src + 1),
+                       u32_load_unaligned(src + sstride + 1),
+                       u32_load_unaligned(src + 2 * sstride + 1),
+                       u32_load_unaligned(src + 3 * sstride + 1));
+      e = v128_from_32(u32_load_unaligned(src + 2 * !!right),
+                       u32_load_unaligned(src + sstride + 2),
+                       u32_load_unaligned(src + 2 * sstride + 2),
+                       u32_load_unaligned(src + 3 * sstride + 2));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
    }
 
     o = calc_delta(o, a, b, c, d, e, f, sp, sm);
@@ -176,9 +182,10 @@ void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
                                int dstride, int x0, int y0, int sizex,
                                int sizey, int width, int height,
                                unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height ||
-      (sizey & 3 && sizex == 4) || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block widths not 4 or 8
+    // * block heights not a multiple of 4 if the block width is 4
     aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, width,
                      height, strength);
   } else {
@@ -255,24 +262,27 @@ SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
     v128 o = v128_from_v64(l1, l2);
     const v128 a = v128_from_v64(v64_load_aligned(src - (y != -y0) * sstride),
                                  l1);
-    v128 b = v128_from_v64(v64_load_unaligned(src - 2 * !!x0),
-                           v64_load_unaligned(src - 2 * !!x0 + sstride));
-    v128 c = v128_from_v64(v64_load_unaligned(src - !!x0),
-                           v64_load_unaligned(src - !!x0 + sstride));
-    v128 d = v128_from_v64(v64_load_unaligned(src + !!right),
-                           v64_load_unaligned(src + !!right + sstride));
-    v128 e = v128_from_v64(v64_load_unaligned(src + 2 * !!right),
-                           v64_load_unaligned(src + 2 * !!right + sstride));
     const v128 f = v128_from_v64(
         l2, v64_load_aligned(src + ((y != bottom) + 1) * sstride));
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_from_v64(v64_load_unaligned(src - 2),
+                        v64_load_unaligned(src - 2 + sstride));
+      c = v128_from_v64(v64_load_unaligned(src - 1),
+                        v64_load_unaligned(src - 1 + sstride));
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_from_v64(v64_load_unaligned(src + 1),
+                        v64_load_unaligned(src + 1 + sstride));
+      e = v128_from_v64(v64_load_unaligned(src + 2),
+                        v64_load_unaligned(src + 2 + sstride));
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd4(o, a, b, c, d, e, f, dst, sp, sm, dstride);
     src += sstride * 2;
@@ -309,18 +319,21 @@ SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
     const v128 o = v128_load_aligned(src);
     const v128 a = v128_load_aligned(src - (y != -y0) * sstride);
     const v128 f = v128_load_aligned(src + (y - 1 != bottom) * sstride);
-    v128 b = v128_load_unaligned(src - 2 * !!x0);
-    v128 c = v128_load_unaligned(src - !!x0);
-    v128 d = v128_load_unaligned(src + !!right);
-    v128 e = v128_load_unaligned(src + 2 * !!right);
+    v128 b, c, d, e;
 
-    if (!x0) {  // Left clipping
-      b = v128_shuffle_8(b, v128_load_aligned(b_shuff));
-      c = v128_shuffle_8(c, v128_load_aligned(c_shuff));
+    if (x0) {
+      b = v128_load_unaligned(src - 2);
+      c = v128_load_unaligned(src - 1);
+    } else {  // Left clipping
+      b = v128_shuffle_8(o, v128_load_aligned(b_shuff));
+      c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
     }
-    if (!right) {  // Right clipping
-      d = v128_shuffle_8(d, v128_load_aligned(d_shuff));
-      e = v128_shuffle_8(e, v128_load_aligned(e_shuff));
+    if (right) {
+      d = v128_load_unaligned(src + 1);
+      e = v128_load_unaligned(src + 2);
+    } else {  // Right clipping
+      d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
+      e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
     }
     calc_delta_hbd8(o, a, b, c, d, e, f, dst, sp, sm);
     src += sstride;
@@ -332,8 +345,10 @@ void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
                                    int sstride, int dstride, int x0, int y0,
                                    int sizex, int sizey, int width, int height,
                                    unsigned int strength) {
-  if ((sizex != 4 && sizex != 8) || y0 + 4 > height || x0 + 4 > width) {
-    // Fallback to C for odd sizes
+  if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
+    // Fallback to C for odd sizes:
+    // * block width not 4 or 8
+    // * block heights not a multiple of 2 if the block width is 4
     aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
                          width, height, strength);
   } else {