x86: AVX2 load_deinterleave_chroma_fenc

author Henrik Gramner <henrik@gramner.com>

Wed, 18 Jan 2017 20:46:55 +0000 (21:46 +0100)

committer Henrik Gramner <henrik@gramner.com>

Sat, 21 Jan 2017 13:10:37 +0000 (14:10 +0100)
author Henrik Gramner <henrik@gramner.com>
Wed, 18 Jan 2017 20:46:55 +0000 (21:46 +0100)
committer Henrik Gramner <henrik@gramner.com>
Sat, 21 Jan 2017 13:10:37 +0000 (14:10 +0100)
diff --git a/common/common.h b/common/common.h

index f26868eb50426fbfa4bec65fc124f3bec4daa211..bce186d5524c3f7d91d0a7f9d866935fb7d1caae 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -781,7 +781,7 @@ struct x264_t
              /* space for p_fenc and p_fdec */
  #define FENC_STRIDE 16
  #define FDEC_STRIDE 32
-            ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
+            ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] );
              ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
  
              /* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm

index f39645a16697e2f0eebe48f5816ab3a81fe89901..c4aff28c7e1e41155169ebe36e5ee7f38a3cf677 100644 (file)
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1250,6 +1250,26 @@ cglobal load_deinterleave_chroma_fdec, 4,4
      RET
  %endmacro ; LOAD_DEINTERLEAVE_CHROMA
  
+%macro LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 0 ; emits load_deinterleave_chroma_fenc: shuffles 4 source rows per iteration into the fenc buffer
+cglobal load_deinterleave_chroma_fenc, 4,5 ; presumably r0=dst, r1=src, r2=i_src, r3d=height (order of the C prototype; confirm against x86inc); r4 is scratch
+    vbroadcasti128 m0, [deinterleave_shuf] ; same 16-byte pshufb mask in both 128-bit lanes
+    lea            r4, [r2*3] ; r4 = 3*i_src, offset of the fourth row of each group
+.loop:
+    mova          xm1, [r1] ; row 0 -> low lane of m1
+    vinserti128    m1, m1, [r1+r2], 1 ; row 1 -> high lane of m1
+    mova          xm2, [r1+r2*2] ; row 2 -> low lane of m2
+    vinserti128    m2, m2, [r1+r4], 1 ; row 3 -> high lane of m2
+    pshufb         m1, m0 ; per-lane byte shuffle; presumably groups U bytes then V bytes (deinterleave_shuf is not visible here)
+    pshufb         m2, m0 ; same shuffle for rows 2 and 3
+    mova [r0+0*FENC_STRIDE], m1 ; 32-byte store of rows 0-1; NOTE(review): looks like this needs a 32-byte-aligned dst, which would explain fenc_buf/pixuv/pix moving to ALIGNED_N - confirm
+    mova [r0+2*FENC_STRIDE], m2 ; 32-byte store of rows 2-3
+    lea            r1, [r1+r2*4] ; advance src by 4 rows
+    add            r0, 4*FENC_STRIDE ; advance dst by 4*FENC_STRIDE
+    sub           r3d, 4 ; 4 rows consumed
+    jg .loop ; repeat while rows remain; a height that is not a multiple of 4 still gets a full 4-row final pass
+    RET
+%endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
+
  %macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
  %if mmsize == 32
      vbroadcasti128 m3, [deinterleave_rgb_shuf+(%1-3)*16]
@@ -1444,6 +1464,7 @@ PLANE_DEINTERLEAVE
  LOAD_DEINTERLEAVE_CHROMA
  PLANE_DEINTERLEAVE_RGB
  INIT_YMM avx2
+LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
  PLANE_DEINTERLEAVE_RGB
  %endif
  
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c

index d8fbafcd593d733f86c95d94c23ba83c192623d6..8f1bca56c5b33e408c2dbb4c607ef1efd3f2660c 100644 (file)
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -138,6 +138,7 @@ void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu,
  void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
  void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
  void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_avx2( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
  void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
  void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
  void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
@@ -823,6 +824,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
          pf->integral_init8h = x264_integral_init8h_avx2;
          pf->integral_init4h = x264_integral_init4h_avx2;
          pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
          pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
      }
  #endif // HIGH_BIT_DEPTH
diff --git a/encoder/analyse.c b/encoder/analyse.c

index ff655521b0db04bc154b02d32d6f3a2601848579..1941bf289e37294c1e265c74a80b264393cb80fb 100644 (file)
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -2147,7 +2147,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
              }
              else
              {
-                ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
+                ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] );
                  int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
                  int v_shift = CHROMA_V_SHIFT;
  
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c

index 3c5357a589a7072302b23e344bf212eb7444952b..dbccb277c894c4e2a0a7b4b787a77122227ef6d8 100644 (file)
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
      stride <<= b_field;
      if( b_chroma )
      {
-        ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] );
+        ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] );
          int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
          int shift = 7 - CHROMA_V_SHIFT;
author	Henrik Gramner <henrik@gramner.com>
	Wed, 18 Jan 2017 20:46:55 +0000 (21:46 +0100)
committer	Henrik Gramner <henrik@gramner.com>
	Sat, 21 Jan 2017 13:10:37 +0000 (14:10 +0100)
common/common.h		patch | blob | history
common/x86/mc-a2.asm		patch | blob | history
common/x86/mc-c.c		patch | blob | history
encoder/analyse.c		patch | blob | history
encoder/ratecontrol.c		patch | blob | history