granicus.if.org Git - libx264/commitdiff
x86: AVX-512 load_deinterleave_chroma_fenc
author Henrik Gramner <henrik@gramner.com>
Sun, 8 Oct 2017 19:23:12 +0000 (21:23 +0200)
committer Anton Mitrofanov <BugMaster@narod.ru>
Sun, 24 Dec 2017 20:47:26 +0000 (23:47 +0300)
common/x86/mc-a2.asm
common/x86/mc-c.c

index a4e11616d2abcb7568cb22df94a11fcbfa603f54..69ed4cd4b924a38a38db9ffd0c4994fcfab1b655 100644 (file)
@@ -1264,17 +1264,27 @@ cglobal load_deinterleave_chroma_fenc, 4,5
     vbroadcasti128 m0, [deinterleave_shuf]
     lea            r4, [r2*3]
 .loop:
-    mova          xm1, [r1]
-    vinserti128    m1, m1, [r1+r2], 1
-    mova          xm2, [r1+r2*2]
-    vinserti128    m2, m2, [r1+r4], 1
+    mova          xm1, [r1]         ; 0
+    vinserti128   ym1, [r1+r2], 1   ; 1
+%if mmsize == 64
+    mova          xm2, [r1+r2*4]    ; 4
+    vinserti32x4   m1, [r1+r2*2], 2 ; 2
+    vinserti32x4   m2, [r1+r4*2], 2 ; 6
+    vinserti32x4   m1, [r1+r4], 3   ; 3
+    lea            r1, [r1+r2*4]
+    vinserti32x4   m2, [r1+r2], 1   ; 5
+    vinserti32x4   m2, [r1+r4], 3   ; 7
+%else
+    mova          xm2, [r1+r2*2]    ; 2
+    vinserti128    m2, [r1+r4], 1   ; 3
+%endif
+    lea            r1, [r1+r2*4]
     pshufb         m1, m0
     pshufb         m2, m0
-    mova [r0+0*FENC_STRIDE], m1
-    mova [r0+2*FENC_STRIDE], m2
-    lea            r1, [r1+r2*4]
-    add            r0, 4*FENC_STRIDE
-    sub           r3d, 4
+    mova         [r0], m1
+    mova  [r0+mmsize], m2
+    add            r0, 2*mmsize
+    sub           r3d, mmsize/8
     jg .loop
     RET
 %endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
@@ -1499,6 +1509,8 @@ PLANE_DEINTERLEAVE_RGB
 INIT_YMM avx2
 LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
 PLANE_DEINTERLEAVE_RGB
+INIT_ZMM avx512
+LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
 %endif
 
 ; These functions are not general-use; not only do they require aligned input, but memcpy
index 51764811ff6e3359f3cd177b459ffa37771e2435..0deb13875657f6c7587c9dff74639becf4590d2e 100644 (file)
@@ -245,6 +245,8 @@ void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intpt
 void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fenc_avx2 x264_template(load_deinterleave_chroma_fenc_avx2)
 void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height );
+#define x264_load_deinterleave_chroma_fenc_avx512 x264_template(load_deinterleave_chroma_fenc_avx512)
+void x264_load_deinterleave_chroma_fenc_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fdec_sse2 x264_template(load_deinterleave_chroma_fdec_sse2)
 void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fdec_ssse3 x264_template(load_deinterleave_chroma_fdec_ssse3)
@@ -909,6 +911,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     {
         pf->mc_luma = mc_luma_avx2;
         pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
         pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
     }
 
@@ -1068,6 +1071,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->integral_init4h = x264_integral_init4h_avx2;
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
         pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
     }
 
     if( cpu&X264_CPU_AVX512 )
@@ -1077,6 +1081,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_avx512;
         pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_avx512;
         pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_avx512;
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx512;
     }
 #endif // HIGH_BIT_DEPTH
 
@@ -1096,7 +1101,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->plane_copy_swap = plane_copy_swap_avx2;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
     pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx2;
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
     pf->get_ref = get_ref_avx2;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
     pf->mbtree_propagate_list = mbtree_propagate_list_avx2;