VSX version of vpx_post_proc_down_and_across_mb_row

author Luc Trudeau <luc@trud.ca>

Wed, 30 May 2018 02:08:18 +0000 (22:08 -0400)

committer Luc Trudeau <luc@trud.ca>

Thu, 31 May 2018 17:13:06 +0000 (13:13 -0400)
author Luc Trudeau <luc@trud.ca>
Wed, 30 May 2018 02:08:18 +0000 (22:08 -0400)
committer Luc Trudeau <luc@trud.ca>
Thu, 31 May 2018 17:13:06 +0000 (13:13 -0400)
diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc

index ebed196c809f919e5f1d6f3b6aae04f8047ee72c..1fe0348fc8479811e4731b1b295f4887040274b1 100644 (file)
--- a/test/pp_filter_test.cc
+++ b/test/pp_filter_test.cc
@@ -42,18 +42,36 @@ int q2mbl(int x) {
  }
  
  class VpxPostProcDownAndAcrossMbRowTest
-    : public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
+    : public AbstractBench,  // presumably supplies runNTimes()/printMedian() used by the Speed test - confirm
+      public ::testing::TestWithParam<VpxPostProcDownAndAcrossMbRowFunc> {
   public:
+  VpxPostProcDownAndAcrossMbRowTest() : mbPostProcDownAndAcross(GetParam()) {}  // cache the function under test from the test parameter
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  const VpxPostProcDownAndAcrossMbRowFunc mbPostProcDownAndAcross;  // implementation being tested/benchmarked
+  // Size of the underlying data block that will be filtered.
+  int block_width;  // not set by the constructor; each test assigns it before use
+  int block_height;  // not set by the constructor; each test assigns it before use
+  Buffer<uint8_t> *src_image;  // non-owning; only the Speed test points it at its local buffer
+  Buffer<uint8_t> *dst_image;  // non-owning; only the Speed test points it at its local buffer
+  uint8_t *flimits;  // per-column filter limits; tests allocate with vpx_memalign and release with vpx_free
+  void run();  // one invocation of mbPostProcDownAndAcross on the members above
  };
  
+void VpxPostProcDownAndAcrossMbRowTest::run() {  // single call of the function under test; src_image, dst_image, block_width and flimits must already be set
+  mbPostProcDownAndAcross(src_image->TopLeftPixel(), dst_image->TopLeftPixel(),
+                          src_image->stride(), dst_image->stride(), block_width,
+                          flimits, 16);  // last argument is the row count; it is the literal 16, not block_height
+}
+
  // Test routine for the VPx post-processing function
  // vpx_post_proc_down_and_across_mb_row_c.
  
  TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
    // Size of the underlying data block that will be filtered.
-  const int block_width = 16;
-  const int block_height = 16;
+  block_width = 16;
+  block_height = 16;
  
    // 5-tap filter needs 2 padding rows above and below the block in the input.
    Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
@@ -66,8 +84,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
        Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
    ASSERT_TRUE(dst_image.Init());
  
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
+  flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));
    (void)memset(flimits, 255, block_width);
  
    // Initialize pixels in the input:
@@ -79,13 +96,12 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) {
    // Initialize pixels in the output to 99.
    dst_image.Set(99);
  
-  ASM_REGISTER_STATE_CHECK(GetParam()(
+  ASM_REGISTER_STATE_CHECK(mbPostProcDownAndAcross(
        src_image.TopLeftPixel(), dst_image.TopLeftPixel(), src_image.stride(),
        dst_image.stride(), block_width, flimits, 16));
  
-  static const uint8_t kExpectedOutput[block_height] = {
-    4, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 4
-  };
+  static const uint8_t kExpectedOutput[] = { 4, 3, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 3, 4 };
  
    uint8_t *pixel_ptr = dst_image.TopLeftPixel();
    for (int i = 0; i < block_height; ++i) {
@@ -103,8 +119,8 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
    // Size of the underlying data block that will be filtered.
    // Y blocks are always a multiple of 16 wide and exactly 16 high. U and V
    // blocks are always a multiple of 8 wide and exactly 8 high.
-  const int block_width = 136;
-  const int block_height = 16;
+  block_width = 136;
+  block_height = 16;
  
    // 5-tap filter needs 2 padding rows above and below the block in the input.
    // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16.
@@ -127,8 +143,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
    // can have a different filter. SSE2 assembly reads flimits in blocks of 16 so
    // it must be padded out.
    const int flimits_width = block_width % 16 ? block_width + 8 : block_width;
-  uint8_t *const flimits =
-      reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
+  flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, flimits_width));
  
    ACMRandom rnd;
    rnd.Reset(ACMRandom::DeterministicSeed());
@@ -143,7 +158,6 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
  
      for (int f = 0; f < 255; f++) {
        (void)memset(flimits + blocks, f, sizeof(*flimits) * 8);
-
        dst_image.Set(0);
        dst_image_ref.Set(0);
  
@@ -151,10 +165,10 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
            src_image.TopLeftPixel(), dst_image_ref.TopLeftPixel(),
            src_image.stride(), dst_image_ref.stride(), block_width, flimits,
            block_height);
-      ASM_REGISTER_STATE_CHECK(
-          GetParam()(src_image.TopLeftPixel(), dst_image.TopLeftPixel(),
-                     src_image.stride(), dst_image.stride(), block_width,
-                     flimits, block_height));
+      ASM_REGISTER_STATE_CHECK(mbPostProcDownAndAcross(
+          src_image.TopLeftPixel(), dst_image.TopLeftPixel(),
+          src_image.stride(), dst_image.stride(), block_width, flimits,
+          block_height));
  
        ASSERT_TRUE(dst_image.CheckValues(dst_image_ref));
      }
@@ -163,6 +177,42 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
    vpx_free(flimits);
  }
  
+TEST_P(VpxPostProcDownAndAcrossMbRowTest, DISABLED_Speed) {  // benchmark on a 16x16 block; the DISABLED_ prefix keeps gtest from running it by default
+  // Size of the underlying data block that will be filtered.
+  block_width = 16;
+  block_height = 16;
+
+  // 5-tap filter needs 2 padding rows above and below the block in the input.
+  Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2);
+  ASSERT_TRUE(src_image.Init());
+  this->src_image = &src_image;  // the local shadows the member; publish it so run() can reach it
+
+  // Filter extends output block by 8 samples at left and right edges.
+  // Though the left padding is only 8 bytes, the assembly code tries to
+  // read 16 bytes before the pointer.
+  Buffer<uint8_t> dst_image =
+      Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8);
+  ASSERT_TRUE(dst_image.Init());
+  this->dst_image = &dst_image;  // same as above: member pointer refers to this stack-local buffer
+
+  flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width));  // released by vpx_free at the end of the test
+  (void)memset(flimits, 255, block_width);
+
+  // Initialize pixels in the input:
+  //   block pixels to value 1,
+  //   border pixels to value 10.
+  src_image.SetPadding(10);
+  src_image.Set(1);
+
+  // Initialize pixels in the output to 99.
+  dst_image.Set(99);
+
+  runNTimes(INT16_MAX);  // presumably AbstractBench repeatedly invoking run() - confirm
+  printMedian("16x16");  // presumably reports the median timing under this label - confirm
+
+  vpx_free(flimits);
+};  // NOTE(review): the ';' after the test body is a redundant empty declaration
+
  class VpxMbPostProcAcrossIpTest
      : public AbstractBench,
        public ::testing::TestWithParam<VpxMbPostProcAcrossIpFunc> {
@@ -500,6 +550,9 @@ INSTANTIATE_TEST_CASE_P(MSA, VpxMbPostProcDownTest,
  #endif  // HAVE_MSA
  
  #if HAVE_VSX
+INSTANTIATE_TEST_CASE_P(  // runs every VpxPostProcDownAndAcrossMbRowTest case against the VSX implementation
+    VSX, VpxPostProcDownAndAcrossMbRowTest,
+    ::testing::Values(vpx_post_proc_down_and_across_mb_row_vsx));
  
  INSTANTIATE_TEST_CASE_P(VSX, VpxMbPostProcAcrossIpTest,
                          ::testing::Values(vpx_mbpost_proc_across_ip_vsx));
diff --git a/vpx_dsp/ppc/deblock_vsx.c b/vpx_dsp/ppc/deblock_vsx.c

index de19a4601c22751bbc2f273d3dd8140bbb3c93ec..4329081ee6333bae2da7302dfa81d7224729dd24 100644 (file)
--- a/vpx_dsp/ppc/deblock_vsx.c
+++ b/vpx_dsp/ppc/deblock_vsx.c
@@ -19,6 +19,106 @@ static const uint8x16_t load_merge = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A,
                                         0x0C, 0x0E, 0x18, 0x19, 0x1A, 0x1B,
                                         0x1C, 0x1D, 0x1E, 0x1F };
  
+static const uint8x16_t st8_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,  // vec_perm selector: elements 0-7 come from the first operand,
+                                     0x06, 0x07, 0x18, 0x19, 0x1A, 0x1B,  // elements 8-15 (indices 0x18-0x1F) from the same positions of the second;
+                                     0x1C, 0x1D, 0x1E, 0x1F };  // used so a 16-byte store only changes the first 8 bytes' worth of filtered output
+
+static INLINE uint8x16_t vec_abd_s8(uint8x16_t a, uint8x16_t b) {  // per-lane absolute difference of two byte vectors
+  return vec_sub(vec_max(a, b), vec_min(a, b));  // max - min, so the subtraction never goes below zero; NOTE(review): the _s8 suffix reads as signed but the operands are uint8x16_t
+}
+
+static INLINE uint8x16_t apply_filter(uint8x16_t ctx[4], uint8x16_t v,  // per lane: smooth v toward its four neighbours in ctx when they are all close to v
+                                      uint8x16_t filter) {  // filter holds the per-lane threshold
+  const uint8x16_t k1 = vec_avg(ctx[0], ctx[1]);  // average of the two neighbours on one side
+  const uint8x16_t k2 = vec_avg(ctx[3], ctx[2]);  // average of the two neighbours on the other side
+  const uint8x16_t k3 = vec_avg(k1, k2);  // average of the two side averages
+  const uint8x16_t f_a = vec_max(vec_abd_s8(v, ctx[0]), vec_abd_s8(v, ctx[1]));
+  const uint8x16_t f_b = vec_max(vec_abd_s8(v, ctx[2]), vec_abd_s8(v, ctx[3]));
+  const bool8x16_t mask = vec_cmplt(vec_max(f_a, f_b), filter);  // set where the largest |v - neighbour| is strictly below the threshold
+  return vec_sel(v, vec_avg(k3, v), mask);  // masked lanes take avg(k3, v); all others keep v unchanged
+}
+
+static INLINE void vert_ctx(uint8x16_t ctx[4], int col, uint8_t *src,  // fills ctx with the 16-byte vectors vertically adjacent to src + col
+                            int stride) {
+  ctx[0] = vec_vsx_ld(col - 2 * stride, src);  // two rows above
+  ctx[1] = vec_vsx_ld(col - stride, src);  // one row above
+  ctx[2] = vec_vsx_ld(col + stride, src);  // one row below
+  ctx[3] = vec_vsx_ld(col + 2 * stride, src);  // two rows below
+}
+
+static INLINE void horz_ctx(uint8x16_t ctx[4], uint8x16_t left_ctx,  // builds the four horizontally shifted copies of v that apply_filter uses as neighbours
+                            uint8x16_t v, uint8x16_t right_ctx) {  // left_ctx / right_ctx are the vectors on either side of v
+  static const uint8x16_t l2_perm = { 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13,  // last 2 elements of the first operand, then elements 0-13 of the second
+                                      0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+                                      0x1A, 0x1B, 0x1C, 0x1D };
+
+  static const uint8x16_t l1_perm = { 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,  // last element of the first operand, then elements 0-14 of the second
+                                      0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
+                                      0x1B, 0x1C, 0x1D, 0x1E };
+
+  static const uint8x16_t r1_perm = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,  // elements 1-15 of the first operand, then element 0 of the second
+                                      0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C,
+                                      0x0D, 0x0E, 0x0F, 0x10 };
+
+  static const uint8x16_t r2_perm = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  // elements 2-15 of the first operand, then elements 0-1 of the second
+                                      0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                      0x0E, 0x0F, 0x10, 0x11 };
+  ctx[0] = vec_perm(left_ctx, v, l2_perm);  // lane i holds the pixel two positions to the left of v[i]
+  ctx[1] = vec_perm(left_ctx, v, l1_perm);  // lane i holds the pixel one position to the left of v[i]
+  ctx[2] = vec_perm(v, right_ctx, r1_perm);  // lane i holds the pixel one position to the right of v[i]
+  ctx[3] = vec_perm(v, right_ctx, r2_perm);  // lane i holds the pixel two positions to the right of v[i]
+}
+void vpx_post_proc_down_and_across_mb_row_vsx(unsigned char *src_ptr,  // for each of `size` rows: filter vertically from src into dst, then filter dst horizontally in place
+                                              unsigned char *dst_ptr,
+                                              int src_pixels_per_line,  // byte step between consecutive src rows
+                                              int dst_pixels_per_line, int cols,  // byte step between dst rows; number of columns to process
+                                              unsigned char *f, int size) {  // f: per-column thresholds, read 16 bytes at a time; size: row count
+  int row, col;
+  uint8x16_t ctx[4], out, v, left_ctx;
+
+  for (row = 0; row < size; row++) {
+    for (col = 0; col < cols - 8; col += 16) {  // vertical pass, 16 columns per step; the bound leaves a trailing 8-column block to the branch below
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      v = vec_vsx_ld(col, src_ptr);
+      vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
+      vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
+    }
+
+    if (col != cols) {  // remainder block; assumes cols is a multiple of 8 so that exactly 8 columns remain - TODO confirm
+      const uint8x16_t filter = vec_vsx_ld(col, f);  // reads 16 threshold bytes even though only 8 columns remain
+      v = vec_vsx_ld(col, src_ptr);
+      vert_ctx(ctx, col, src_ptr, src_pixels_per_line);
+      out = apply_filter(ctx, v, filter);
+      vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);  // 16-byte store: 8 filtered bytes followed by 8 unfiltered source bytes
+    }
+
+    /* now post_proc_across */
+    left_ctx = vec_splats(dst_ptr[0]);  // left edge: replicate the first pixel of the row
+    v = vec_vsx_ld(0, dst_ptr);
+    for (col = 0; col < cols - 8; col += 16) {  // horizontal pass over the vertically filtered row
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      const uint8x16_t right_ctx = (col + 16 == cols)
+                                       ? vec_splats(dst_ptr[cols - 1])  // last block of the row: replicate the final pixel
+                                       : vec_vsx_ld(col, dst_ptr + 16);  // otherwise the next 16 bytes, read before they are overwritten
+      horz_ctx(ctx, left_ctx, v, right_ctx);
+      vec_vsx_st(apply_filter(ctx, v, filter), col, dst_ptr);
+      left_ctx = v;  // slide the window: the unfiltered current block becomes the left context
+      v = right_ctx;  // and the already-loaded next block becomes the current one
+    }
+
+    if (col != cols) {  // remainder block of the horizontal pass (same condition as the vertical remainder)
+      const uint8x16_t filter = vec_vsx_ld(col, f);
+      const uint8x16_t right_ctx = vec_splats(dst_ptr[cols - 1]);
+      horz_ctx(ctx, left_ctx, v, right_ctx);  // NOTE(review): lanes 6-7 get their right neighbours from v lanes 8-9 (bytes at dst_ptr[cols], dst_ptr[cols + 1]), not from right_ctx - confirm this matches the C reference at the right edge
+      out = apply_filter(ctx, v, filter);
+      vec_vsx_st(vec_perm(out, v, st8_perm), col, dst_ptr);  // 8 filtered bytes, then the 8 bytes of v written back unchanged
+    }
+
+    src_ptr += src_pixels_per_line;  // advance both pointers to the next row
+    dst_ptr += dst_pixels_per_line;
+  }
+}
+
  // C: s[c + 7]
  static INLINE int16x8_t next7l_s16(uint8x16_t c) {
    static const uint8x16_t next7_perm = {
diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h

index ca925f940d808e6ec50a542a381f70224cf0f43c..c6c7ce9f14709a60b51f0412444380fd6bafbee5 100644 (file)
--- a/vpx_dsp/ppc/types_vsx.h
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -19,6 +19,7 @@ typedef vector signed short int16x8_t;
  typedef vector unsigned short uint16x8_t;
  typedef vector signed int int32x4_t;
  typedef vector unsigned int uint32x4_t;
+typedef vector bool char bool8x16_t;  // 16 boolean byte lanes; holds the vec_cmplt mask that apply_filter passes to vec_sel
  typedef vector bool short bool16x8_t;
  typedef vector bool int bool32x4_t;
  
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl

index 52aabb4fdf3a900bc4a0e095efe03218e5e507ba..9661f3bd8e1a3d237780e7ef98d192bfa765a72c 100644 (file)
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1604,7 +1604,7 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC")
      specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa vsx/;
  
      add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size";
-    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/;
+    specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa vsx/;
  
  }
author	Luc Trudeau <luc@trud.ca>
	Wed, 30 May 2018 02:08:18 +0000 (22:08 -0400)
committer	Luc Trudeau <luc@trud.ca>
	Thu, 31 May 2018 17:13:06 +0000 (13:13 -0400)
test/pp_filter_test.cc		patch \| blob \| history
vpx_dsp/ppc/deblock_vsx.c		patch \| blob \| history
vpx_dsp/ppc/types_vsx.h		patch \| blob \| history
vpx_dsp/vpx_dsp_rtcd_defs.pl		patch \| blob \| history