vbroadcasti128 m0, [deinterleave_shuf]
lea r4, [r2*3]
.loop:
- mova xm1, [r1]
- vinserti128 m1, m1, [r1+r2], 1
- mova xm2, [r1+r2*2]
- vinserti128 m2, m2, [r1+r4], 1
+ mova xm1, [r1] ; row 0 -> 128-bit lane 0 of m1 (row n = r1 + n*r2; r2 is presumably the source stride - confirm against the function header, not visible here)
+ vinserti128 ym1, [r1+r2], 1 ; row 1 -> lane 1 of m1
+%if mmsize == 64
+ mova xm2, [r1+r2*4] ; row 4 -> lane 0 of m2; this path gathers rows 0-3 in m1 and rows 4-7 in m2
+ vinserti32x4 m1, [r1+r2*2], 2 ; row 2 -> lane 2 of m1
+ vinserti32x4 m2, [r1+r4*2], 2 ; row 6 -> lane 2 of m2 (r4 = r2*3, so r4*2 = 6*r2)
+ vinserti32x4 m1, [r1+r4], 3 ; row 3 -> lane 3 of m1
+ lea r1, [r1+r2*4] ; step r1 forward 4 rows so that rows 5 and 7 are addressable as r1+r2 and r1+r4
+ vinserti32x4 m2, [r1+r2], 1 ; row 5 -> lane 1 of m2 (offset is relative to the advanced r1)
+ vinserti32x4 m2, [r1+r4], 3 ; row 7 -> lane 3 of m2 (offset is relative to the advanced r1)
+%else
+ mova xm2, [r1+r2*2] ; row 2 -> lane 0 of m2; this path gathers rows 0-1 in m1 and rows 2-3 in m2
+ vinserti128 m2, [r1+r4], 1 ; row 3 -> lane 1 of m2
+%endif
+ lea r1, [r1+r2*4] ; common 4-row advance; on the mmsize == 64 path it follows the earlier one, 8 rows in total
pshufb m1, m0
pshufb m2, m0
- mova [r0+0*FENC_STRIDE], m1
- mova [r0+2*FENC_STRIDE], m2
- lea r1, [r1+r2*4]
- add r0, 4*FENC_STRIDE
- sub r3d, 4
+ mova [r0], m1 ; shuffled first half of this iteration's rows
+ mova [r0+mmsize], m2 ; shuffled second half, stored one full register width after the first
+ add r0, 2*mmsize ; two registers' worth of output written per iteration
+ sub r3d, mmsize/8 ; rows loaded per iteration: 4 when mmsize is 32, 8 when mmsize is 64
jg .loop
RET
%endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
INIT_YMM avx2
LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
PLANE_DEINTERLEAVE_RGB
+INIT_ZMM avx512 ; select the avx512 target before expanding the macro a second time; presumably this makes mmsize 64 so the "%if mmsize == 64" path is taken (x86inc convention - confirm)
+LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 ; NOTE(review): the macro name still says AVX2 although this expansion produces the avx512 variant
%endif
; These functions are not general-use; not only do they require aligned input, but memcpy
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fenc_avx2 x264_template(load_deinterleave_chroma_fenc_avx2)
void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height );
+#define x264_load_deinterleave_chroma_fenc_avx512 x264_template(load_deinterleave_chroma_fenc_avx512)
+void x264_load_deinterleave_chroma_fenc_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height ); /* NOTE(review): typed uint8_t where the avx2 prototype uses pixel; presumably this variant is only meant for 8-bit-depth builds - confirm */
#define x264_load_deinterleave_chroma_fdec_sse2 x264_template(load_deinterleave_chroma_fdec_sse2)
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
#define x264_load_deinterleave_chroma_fdec_ssse3 x264_template(load_deinterleave_chroma_fdec_ssse3)
{
pf->mc_luma = mc_luma_avx2;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}
pf->integral_init4h = x264_integral_init4h_avx2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
}
if( cpu&X264_CPU_AVX512 )
pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_avx512;
pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_avx512;
pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_avx512;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx512;
}
#endif // HIGH_BIT_DEPTH
pf->plane_copy_swap = plane_copy_swap_avx2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx2;
- pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
pf->mbtree_propagate_list = mbtree_propagate_list_avx2;