From: Luc Trudeau
Date: Wed, 30 May 2018 02:08:18 +0000 (-0400)
Subject: VSX version of vpx_post_proc_down_and_across_mb_row
X-Git-Tag: v1.8.0~647
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=656e8ac61;p=libvpx

VSX version of vpx_post_proc_down_and_across_mb_row

Low bit depth version only.

Passes the VpxPostProcDownAndAcrossMbRowTest

VpxMbPostProcAcrossIpTest Speed Test (POWER8 Model 2.1)
C time = 121.3 ms (±4.0 ms), VSX time = 9.4 ms (±0.3 ms) [12.9x]

Change-Id: I28300779e197ea3855cf30867d17a2805388b447
---

diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc
index ebed196c8..1fe0348fc 100644
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -42,18 +42,36 @@ int q2mbl(int x) {
 }
 
 class VpxPostProcDownAndAcrossMbRowTest
-    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
+    : public AbstractBench,
+      public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
  public:
+  VpxPostProcDownAndAcrossMbRowTest() : mbPostProcDownAndAcross(GetParam()) {}
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  const VpxPostProcDownAndAcrossMbRowFunc mbPostProcDownAndAcross;
+  // Size of the underlying data block that will be filtered.
+  int block_width;
+  int block_height;
+  Buffer<uint8_t> *src_image;
+  Buffer<uint8_t> *dst_image;
+  uint8_t *flimits;
+  void run();
 };
 
+void VpxPostProcDownAndAcrossMbRowTest::run() {
+  mbPostProcDownAndAcross(src_image->TopLeftPixel(), dst_image->TopLeftPixel(),
+                          src_image->stride(), dst_image->stride(), block_width,
+                          flimits, 16);
+}
+
 // Test routine for the VPx post-processing function
 // vpx_post_proc_down_and_across_mb_row_c.
 
 TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   // Size of the underlying data block that will be filtered.
-  const int block_width = 16;
-  const int block_height = 16;
+  block_width = 16;
+  block_height = 16;
 
   // 5-tap filter needs 2 padding rows above and below the block in the input.
   Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
@@ -66,8 +84,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
       Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
   ASSERT_TRUE(dst_image.Init());
 
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
   (void)memset(flimits, 255, block_width);
 
   // Initialize pixels in the input:
@@ -79,13 +96,12 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
   // Initialize pixels in the output to 99.
   dst_image.Set(99);
 
-  ASM_REGISTER_STATE_CHECK(GetParam()(
+  ASM_REGISTER_STATE_CHECK(mbPostProcDownAndAcross(
       src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(),
       dst_image.stride(), block_width, flimits, 16));
 
-  static const uint8_t kExpectedOutput[block_height] = {
-    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
-  };
+  static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 3, 4 };
 
   uint8_t *pixel_ptr = dst_image.TopLeftPixel();
   for (int i = 0; i < block_height; ++i) {
@@ -103,8 +119,8 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   // Size of the underlying data block that will be filtered.
   // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V
   // blocks are always a multiple of 8 wide and exactly 8 high.
-  const int block_width = 136;
-  const int block_height = 16;
+  block_width = 136;
+  block_height = 16;
 
   // 5-tap filter needs 2 padding rows above and below the block in the input.
   // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
@@ -127,8 +143,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so
   // it must be padded out.
   const int flimits_width = block_width % 16 ? block_width + 8 : block_width;
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
+  flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
 
   ACMRandom rnd;
   rnd.Reset(ACMRandom::DeterministicSeed());
@@ -143,7 +158,6 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
 
   for (int f = 0; f < 255; f++) {
     (void)memset(flimits + blocks, f, sizeof(*flimits) * 8);
-
     dst_image.Set(0);
     dst_image_ref.Set(0);
 
@@ -151,10 +165,10 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
         src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(),
         src_image.stride(), dst_image_ref.stride(), block_width, flimits,
         block_height);
-    ASM_REGISTER_STATE_CHECK(
-        GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(),
-                   src_image.stride(), dst_image.stride(), block_width,
-                   flimits, block_height));
+    ASM_REGISTER_STATE_CHECK(mbPostProcDownAndAcross(
+        src_image.TopLeftPixel(), dst_image.TopLeftPixel(),
+        src_image.stride(), dst_image.stride(), block_width, flimits,
+        block_height));
 
     ASSERT_TRUE(dst_image.CheckValues(dst_image_ref));
   }
@@ -163,6 +177,42 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   vpx_free(flimits);
 }
 
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) {
+  // Size of the underlying data block that will be filtered.
+  block_width = 16;
+  block_height = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
+  ASSERT_TRUE(src_image.Init());
+  this->src_image = &src_image;
+
+  // Filter extends output block by 8 samples at left and right edges.
+  // Though the left padding is only 8 bytes, the assembly code tries to
+  // read 16 bytes before the pointer.
+  Buffer<uint8_t> dst_image =
+      Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
+  ASSERT_TRUE(dst_image.Init());
+  this->dst_image = &dst_image;
+
+  flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  (void)memset(flimits, 255, block_width);
+
+  // Initialize pixels in the input:
+  //   block pixels to value 1,
+  //   border pixels to value 10.
+  src_image.SetPadding(10);
+  src_image.Set(1);
+
+  // Initialize pixels in the output to 99.
+  dst_image.Set(99);
+
+  runNTimes(INT16_MAX);
+  printMedian("16x16");
+
+  vpx_free(flimits);
+};
+
 class VpxMbPostProcAcrossIpTest
     : public AbstractBench,
       public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
@@ -500,6 +550,9 @@ INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest,
 #endif  // HAVE_MSA
 
 #if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(
+    VSX, VpxPostProcDownAndAcrossMbRowTest,
+    ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx));
 INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcAcrossIpTest,
                         ::testing::Values(vpx_mbpost_proc_across_ip_vsx));
 
diff --git a/vpx_dsp/ppc/deblock_vsx.c b/vpx_dsp/ppc/deblock_vsx.c
index de19a4601..4329081ee 100644
--- a/vpx_dsp/ppc/deblock_vsx.c
+++ b/vpx_dsp/ppc/deblock_vsx.c
@@ -19,6 +19,106 @@ static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A,
                                        0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B,
                                        0x1C, 0x1D, 0x1E, 0x1F };
 
+static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                     0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B,
+                                     0x1C, 0x1D, 0x1E, 0x1F };
+
+static INLINE uint8x16_t vec_abd_s8(uint8x16_t a, uint8x16_t b) {
+  return vec_sub(vec_max(a, b), vec_min(a, b));
+}
+
+static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v,
+                                      uint8x16_t filter) {
+  const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]);
+  const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]);
+  const uint8x16_t k3 = vec_avg(k1, k2);
+  const uint8x16_t f_a = vec_max(vec_abd_s8(v, ctx[0]), vec_abd_s8(v, ctx[1]));
+  const uint8x16_t f_b = vec_max(vec_abd_s8(v, ctx[2]), vec_abd_s8(v, ctx[3]));
+  const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter);
+  return vec_sel(v, vec_avg(k3, v), mask);
+}
+
+static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src,
+                            int stride) {
+  ctx[0] = vec_vsx_ld(col - 2 * stride, src);
+  ctx[1] = vec_vsx_ld(col - stride, src);
+  ctx[2] = vec_vsx_ld(col + stride, src);
+  ctx[3] = vec_vsx_ld(col + 2 * stride, src);
+}
+
+static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx,
+                            uint8x16_t v, uint8x16_t right_ctx) {
+  static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13,
+                                      0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+                                      0x1A, 0x1B, 0x1C, 0x1D };
+
+  static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
+                                      0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
+                                      0x1B, 0x1C, 0x1D, 0x1E };
+
+  static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
+                                      0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C,
+                                      0x0D, 0x0E, 0x0F, 0x10 };
+
+  static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                      0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                      0x0E, 0x0F, 0x10, 0x11 };
+  ctx[0] = vec_perm(left_ctx, v, l2_perm);
+  ctx[1] = vec_perm(left_ctx, v, l1_perm);
+  ctx[2] = vec_perm(v, right_ctx, r1_perm);
+  ctx[3] = vec_perm(v, right_ctx, r2_perm);
+}
+void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr,
+                                              unsigned char *dst_ptr,
+                                              int src_pixels_per_line,
+                                              int dst_pixels_per_line, int cols,
+                                              unsigned char *f, int size) {
+  int row, col;
+  uint8x16_t ctx[4], out, v, left_ctx;
+
+  for (row = 0; row < size; row++) {
+    for (col = 0; col < cols - 8; col += 16) {
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      v = vec_vsx_ld(col, src_ptr);
+      vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
+      vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
+    }
+
+    if (col != cols) {
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      v = vec_vsx_ld(col, src_ptr);
+      vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
+      out = apply_filter(ctx, v, filter);
+      vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);
+    }
+
+    /* now post_proc_across */
+    left_ctx = vec_splats(dst_ptr[0]);
+    v = vec_vsx_ld(0, dst_ptr);
+    for (col = 0; col < cols - 8; col += 16) {
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      const uint8x16_t right_ctx = (col + 16 == cols)
+                                       ? vec_splats(dst_ptr[cols - 1])
+                                       : vec_vsx_ld(col, dst_ptr + 16);
+      horz_ctx(ctx, left_ctx, v, right_ctx);
+      vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
+      left_ctx = v;
+      v = right_ctx;
+    }
+
+    if (col != cols) {
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]);
+      horz_ctx(ctx, left_ctx, v, right_ctx);
+      out = apply_filter(ctx, v, filter);
+      vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);
+    }
+
+    src_ptr += src_pixels_per_line;
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+
 // C: s[c + 7]
 static INLINE int16x8_t next7l_s16(uint8x16_t c) {
   static const uint8x16_t next7_perm = {
diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h
index ca925f940..c6c7ce9f1 100644
--- a/vpx_dsp/ppc/types_vsx.h
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -19,6 +19,7 @@ typedef vector signed short int16x8_t;
 typedef vector unsigned short uint16x8_t;
 typedef vector signed int int32x4_t;
 typedef vector unsigned int uint32x4_t;
+typedef vector bool char bool8x16_t;
 typedef vector bool short bool16x8_t;
 typedef vector bool int bool32x4_t;
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 52aabb4fd..9661f3bd8 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1604,7 +1604,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
     specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
 
     add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
-    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
+    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa vsx/;
 }
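
Note (illustrative, not part of the patch): apply_filter() in the
deblock_vsx.c hunk above is the 16-lane SIMD form of the scalar deblock step
that the C reference, vpx_post_proc_down_and_across_mb_row_c, applies in both
the down and the across pass. The sketch below models a single lane, assuming
ctx[0..3] holds the two neighbours on either side of v (above/below or
left/right) and f is that column's flimit; filter_pixel is a hypothetical
name, not a libvpx function.

#include <stdlib.h>

/* One-lane model of apply_filter(): filter only when all four neighbour
 * differences are strictly below the flimit, then blend v with a cascade of
 * rounded averages (vec_avg computes (a + b + 1) >> 1 per lane). */
static unsigned char filter_pixel(unsigned char v, const unsigned char ctx[4],
                                  unsigned char f) {
  if (abs(v - ctx[0]) < f && abs(v - ctx[1]) < f && abs(v - ctx[2]) < f &&
      abs(v - ctx[3]) < f) {
    const int k1 = (ctx[0] + ctx[1] + 1) >> 1; /* vec_avg(ctx[0], ctx[1]) */
    const int k2 = (ctx[3] + ctx[2] + 1) >> 1; /* vec_avg(ctx[3], ctx[2]) */
    const int k3 = (k1 + k2 + 1) >> 1;         /* vec_avg(k1, k2) */
    return (unsigned char)((k3 + v + 1) >> 1); /* lane selected by the mask */
  }
  return v; /* mask lane is false: keep the original pixel */
}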