}
}
+#if HAVE_VSX
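+/* Split a packed 3- or 4-byte-per-pixel source (pw == 3 or pw == 4) into the
+ * three separate planes dsta/dstb/dstc, 16 pixels per iteration, using VSX
+ * unaligned loads and vec_perm byte shuffles. */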
+void x264_plane_copy_deinterleave_rgb_altivec( uint8_t *dsta, intptr_t i_dsta,
+ uint8_t *dstb, intptr_t i_dstb,
+ uint8_t *dstc, intptr_t i_dstc,
+ uint8_t *src, intptr_t i_src,
+ int pw, int w, int h )
+{
+ if( pw == 3 )
+ {
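+ /* pw == 3: each iteration reads 48 source bytes (16 packed pixels).
+ * mask[0] gathers components a and b of pixels 0-7, mask[1] the same for
+ * pixels 8-15; mask[2] gathers c0-c9 (the 0xFF entries are placeholders)
+ * and mask[3] keeps them while pulling c10-c15 from the third load. */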
+ const vec_u8_t mask[4] = {
+ { 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0x10, 0x13, 0x16 },
+ { 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x18, 0x1B, 0x1E },
+ { 0x02, 0x05, 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x13, 0x16, 0x19, 0x1C, 0x1F }
+ };
+
+ for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src )
+ {
+ for( int x = 0; x < w; x += 16 )
+ {
+ vec_u8_t srcv1 = vec_vsx_ld( 3 * x, src );
+ vec_u8_t srcv2 = vec_vsx_ld( 3 * x + 16, src );
+ vec_u8_t srcv3 = vec_vsx_ld( 3 * x + 32, src );
+ vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
+ vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv2, srcv3, mask[1] ); // a8 a9 a10 a11 a12 a13 a14 a15 b8 b9 b10 b11 b12 b13 b14 b15
+ vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta ); // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
+ vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb ); // b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
+
+ srcv1 = vec_perm( srcv1, srcv2, mask[2] ); // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
+ srcv1 = vec_perm( srcv1, srcv3, mask[3] ); // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
+ vec_st( srcv1, x, dstc );
+ }
+ }
+ }
+ else
+ {
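+ /* pw == 4: each iteration reads 64 source bytes (16 packed pixels).
+ * mask[0] gathers components a and b of 8 pixels, mask[1] gathers
+ * components c and d; the fourth component d is never stored. */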
+ const vec_u8_t mask[2] = {
+ { 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, 0x01, 0x05, 0x09, 0x0D, 0x11, 0x15, 0x19, 0x1D },
+ { 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F, 0x13, 0x17, 0x1B, 0x1F }
+ };
+
+ for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src )
+ {
+ for( int x = 0; x < w; x += 16 )
+ {
+ vec_u8_t srcv1 = vec_vsx_ld( 4 * x, src );
+ vec_u8_t srcv2 = vec_vsx_ld( 4 * x + 16, src );
+ vec_u8_t srcv3 = vec_vsx_ld( 4 * x + 32, src );
+ vec_u8_t srcv4 = vec_vsx_ld( 4 * x + 48, src );
+
+ vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
+ vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[0] ); // a8 a9 a10 a11 a12 a13 a14 a15 b8 b9 b10 b11 b12 b13 b14 b15
+ vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta ); // a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
+ vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb ); // b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
+
+ tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[1] ); // c0 c1 c2 c3 c4 c5 c6 c7 d0 d1 d2 d3 d4 d5 d6 d7
+ tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[1] ); // c8 c9 c10 c11 c12 c13 c14 c15 d8 d9 d10 d11 d12 d13 d14 d15
+ vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dstc ); // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
+ }
+ }
+ }
+}
+#endif
+
static void mc_luma_altivec( uint8_t *dst, intptr_t i_dst_stride,
uint8_t *src[4], intptr_t i_src_stride,
int mvx, int mvy,
pf->plane_copy_swap = x264_plane_copy_swap_altivec;
pf->plane_copy_interleave = x264_plane_copy_interleave_altivec;
pf->store_interleave_chroma = x264_store_interleave_chroma_altivec;
+#if HAVE_VSX
+ pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_altivec;
+#endif // HAVE_VSX
#endif // !HIGH_BIT_DEPTH
}