From 4c48f9e751e969188d606eb15aeada7f652c9db9 Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Sat, 29 Apr 2017 14:26:40 +0200
Subject: [PATCH] x86: AVX-512 pixel_var_8x8, 8x16, and 16x16

Make the SSE2, AVX, and AVX2 versions a bit faster.
Drop the MMX and XOP versions.
---
 common/pixel.c         |  19 ++--
 common/x86/pixel-a.asm | 224 +++++++++++++++++++++++++++--------------
 common/x86/pixel.h     |   9 +-
 3 files changed, 158 insertions(+), 94 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index c33a873f..82652e96 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -884,9 +884,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT4( hadamard_ac, _mmx2 );
         INIT8( ssd, _mmx2 );
         INIT_ADS( _mmx2 );
-
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
 #if ARCH_X86
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_mmx2;
@@ -1028,8 +1025,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT5( sad_x3, _xop );
         INIT5( sad_x4, _xop );
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
         pixf->vsad = x264_pixel_vsad_xop;
         pixf->asd8 = x264_pixel_asd8_xop;
 #if ARCH_X86_64
@@ -1048,6 +1043,11 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx2;
         pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_avx2;
     }
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx512;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
@@ -1067,9 +1067,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( satd_x4, _mmx2 );
         INIT4( hadamard_ac, _mmx2 );
         INIT_ADS( _mmx2 );
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
-        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_mmx2;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmx2;
 #if ARCH_X86
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmx2;
@@ -1321,9 +1318,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
         pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
         pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
-        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
-        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_xop;
-        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_xop;
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_xop;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_xop;
 #if ARCH_X86_64
@@ -1356,6 +1350,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     {
         INIT8( satd, _avx512 );
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx512;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx512;
+        pixf->var[PIXEL_8x16]  = x264_pixel_var_8x16_avx512;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
     }
 #endif //HAVE_MMX
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 9b3dc27b..082cb430 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -32,6 +32,8 @@
 %include "x86util.asm"
 
 SECTION_RODATA 32
+var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
+                 db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
 hmul_16p:  times 16 db 1
            times 8 db 1, -1
 hmul_8p:   times 8 db 1
@@ -701,25 +703,32 @@ SSD_NV12
 %if HIGH_BIT_DEPTH == 0
 %if %1
     mova    m7, [pw_00ff]
-%elif mmsize < 32
+%elif mmsize == 16
     pxor    m7, m7    ; zero
 %endif
 %endif ; !HIGH_BIT_DEPTH
 %endmacro
 
-%macro VAR_END 2
-%if HIGH_BIT_DEPTH && mmsize == 8 && %1*%2 == 256
-    HADDUW  m5, m2
-%else
-    HADDW   m5, m2
+%macro VAR_END 0
+    pmaddwd m5, [pw_1]
+    SBUTTERFLY dq, 5, 6, 0
+    paddd   m5, m6
+%if mmsize == 32
+    vextracti128 xm6, m5, 1
+    paddd  xm5, xm6
 %endif
-    HADDD   m6, m1
+    MOVHL  xm6, xm5
+    paddd  xm5, xm6
 %if ARCH_X86_64
-    punpckldq m5, m6
-    movq    rax, m5
+    movq    rax, xm5
+%else
+    movd    eax, xm5
+%if cpuflag(avx)
+    pextrd  edx, xm5, 1
 %else
-    movd    eax, m5
-    movd    edx, m6
+    pshuflw xm5, xm5, q1032
+    movd    edx, xm5
+%endif
 %endif
     RET
 %endmacro
 
@@ -739,61 +748,25 @@ SSD_NV12
     paddd   m6, m4
 %endmacro
 
-%macro VAR_2ROW 2
-    mov     r2d, %2
-.loop:
-%if HIGH_BIT_DEPTH
-    mova    m0, [r0]
-    mova    m1, [r0+mmsize]
-    mova    m3, [r0+%1]
-    mova    m4, [r0+%1+mmsize]
-%else ; !HIGH_BIT_DEPTH
-    mova    m0, [r0]
-    mova    m3, [r0+%1]
-    punpckhbw m1, m0, m7
-    punpcklbw m0, m7
-    punpckhbw m4, m3, m7
-    punpcklbw m3, m7
-%endif ; HIGH_BIT_DEPTH
-%ifidn %1, r1
-    lea     r0, [r0+%1*2]
-%else
-    add     r0, r1
-%endif
-    VAR_CORE
-    dec     r2d
-    jg .loop
-%endmacro
-
 ;-----------------------------------------------------------------------------
 ; int pixel_var_wxh( uint8_t *, intptr_t )
 ;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_var_16x16, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW 8*SIZEOF_PIXEL, 16
-    VAR_END 16, 16
-
-cglobal pixel_var_8x16, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW r1, 8
-    VAR_END 8, 16
-
-cglobal pixel_var_8x8, 2,3
-    FIX_STRIDES r1
-    VAR_START 0
-    VAR_2ROW r1, 4
-    VAR_END 8, 8
-
 %if HIGH_BIT_DEPTH
 %macro VAR 0
 cglobal pixel_var_16x16, 2,3,8
     FIX_STRIDES r1
     VAR_START 0
-    VAR_2ROW r1, 8
-    VAR_END 16, 16
+    mov     r2d, 8
+.loop:
+    mova    m0, [r0]
+    mova    m1, [r0+mmsize]
+    mova    m3, [r0+r1]
+    mova    m4, [r0+r1+mmsize]
+    lea     r0, [r0+r1*2]
+    VAR_CORE
+    dec     r2d
+    jg .loop
+    VAR_END
 
 cglobal pixel_var_8x8, 2,3,8
     lea     r2, [r1*3]
@@ -809,18 +782,16 @@ cglobal pixel_var_8x8, 2,3,8
     mova    m3, [r0+r1*4]
     mova    m4, [r0+r2*2]
     VAR_CORE
-    VAR_END 8, 8
+    VAR_END
 %endmacro ; VAR
 
 INIT_XMM sse2
 VAR
 INIT_XMM avx
 VAR
-INIT_XMM xop
-VAR
-%endif ; HIGH_BIT_DEPTH
 
-%if HIGH_BIT_DEPTH == 0
+%else ; HIGH_BIT_DEPTH == 0
+
 %macro VAR 0
 cglobal pixel_var_16x16, 2,3,8
     VAR_START 1
@@ -833,7 +804,7 @@ cglobal pixel_var_16x16, 2,3,8
     VAR_CORE
     dec     r2d
     jg .loop
-    VAR_END 16, 16
+    VAR_END
 
 cglobal pixel_var_8x8, 2,4,8
     VAR_START 1
@@ -849,7 +820,7 @@ cglobal pixel_var_8x8, 2,4,8
     VAR_CORE
     dec     r2d
     jg .loop
-    VAR_END 8, 8
+    VAR_END
 
 cglobal pixel_var_8x16, 2,4,8
     VAR_START 1
@@ -865,15 +836,13 @@ cglobal pixel_var_8x16, 2,4,8
     VAR_CORE
     dec     r2d
     jg .loop
-    VAR_END 8, 16
+    VAR_END
 %endmacro ; VAR
 
 INIT_XMM sse2
 VAR
 INIT_XMM avx
 VAR
-INIT_XMM xop
-VAR
 %endif ; !HIGH_BIT_DEPTH
 
 INIT_YMM avx2
@@ -898,21 +867,120 @@ cglobal pixel_var_16x16, 2,4,7
     VAR_CORE
     dec     r2d
     jg .loop
-    vextracti128 xm0, m5, 1
-    vextracti128 xm1, m6, 1
-    paddw  xm5, xm0
-    paddd  xm6, xm1
-    HADDW  xm5, xm2
-    HADDD  xm6, xm1
+    VAR_END
+
+%macro VAR_AVX512_CORE 1 ; accum
+%if %1
+    paddw   m0, m2
+    pmaddwd m2, m2
+    paddw   m0, m3
+    pmaddwd m3, m3
+    paddd   m1, m2
+    paddd   m1, m3
+%else
+    paddw   m0, m2, m3
+    pmaddwd m2, m2
+    pmaddwd m3, m3
+    paddd   m1, m2, m3
+%endif
+%endmacro
+
+%macro VAR_AVX512_CORE_16x16 1 ; accum
+%if HIGH_BIT_DEPTH
+    mova            ym2, [r0]
+    vinserti64x4     m2, [r0+r1], 1
+    mova            ym3, [r0+2*r1]
+    vinserti64x4     m3, [r0+r3], 1
+%else
+    vbroadcasti64x2 ym2, [r0]
+    vbroadcasti64x2  m2 {k1}, [r0+r1]
+    vbroadcasti64x2 ym3, [r0+2*r1]
+    vbroadcasti64x2  m3 {k1}, [r0+r3]
+    pshufb           m2, m4
+    pshufb           m3, m4
+%endif
+    VAR_AVX512_CORE %1
+%endmacro
+
+%macro VAR_AVX512_CORE_8x8 1 ; accum
accum +%if HIGH_BIT_DEPTH + mova xm2, [r0] + mova xm3, [r0+r1] +%else + movq xm2, [r0] + movq xm3, [r0+r1] +%endif + vinserti128 ym2, [r0+2*r1], 1 + vinserti128 ym3, [r0+r2], 1 + lea r0, [r0+4*r1] + vinserti32x4 m2, [r0], 2 + vinserti32x4 m3, [r0+r1], 2 + vinserti32x4 m2, [r0+2*r1], 3 + vinserti32x4 m3, [r0+r2], 3 +%if HIGH_BIT_DEPTH == 0 + punpcklbw m2, m4 + punpcklbw m3, m4 +%endif + VAR_AVX512_CORE %1 +%endmacro + +INIT_ZMM avx512 +cglobal pixel_var_16x16, 2,4 + FIX_STRIDES r1 + mov r2d, 0xf0 + lea r3, [3*r1] +%if HIGH_BIT_DEPTH == 0 + vbroadcasti64x4 m4, [var_shuf_avx512] + kmovb k1, r2d +%endif + VAR_AVX512_CORE_16x16 0 +.loop: + lea r0, [r0+4*r1] + VAR_AVX512_CORE_16x16 1 + sub r2d, 0x50 + jg .loop +%if ARCH_X86_64 == 0 + pop r3d + %assign regs_used 3 +%endif +var_avx512_end: + vbroadcasti32x4 m2, [pw_1] + pmaddwd m0, m2 + SBUTTERFLY dq, 0, 1, 2 + paddd m0, m1 + vextracti32x8 ym1, m0, 1 + paddd ym0, ym1 + vextracti128 xm1, ym0, 1 + paddd xmm0, xm0, xm1 + punpckhqdq xmm1, xmm0, xmm0 + paddd xmm0, xmm1 %if ARCH_X86_64 - punpckldq xm5, xm6 - movq rax, xm5 + movq rax, xmm0 %else - movd eax, xm5 - movd edx, xm6 + movd eax, xmm0 + pextrd edx, xmm0, 1 %endif RET +%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth +cglobal pixel_var_8x8, 2,3 + lea r2, [3*r1] + pxor xm4, xm4 + VAR_AVX512_CORE_8x8 0 + jmp var_avx512_end +%endif + +cglobal pixel_var_8x16, 2,3 + FIX_STRIDES r1 + lea r2, [3*r1] +%if HIGH_BIT_DEPTH == 0 + pxor xm4, xm4 +%endif + VAR_AVX512_CORE_8x8 0 + lea r0, [r0+4*r1] + VAR_AVX512_CORE_8x8 1 + jmp var_avx512_end + %macro VAR2_END 3 HADDW %2, xm1 movd r1d, %2 diff --git a/common/x86/pixel.h b/common/x86/pixel.h index d7753f53..10716d49 100644 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -94,11 +94,10 @@ DECL_X4( sad, cache64_mmx2 ); DECL_X4( sad, cache64_sse2 ); DECL_X4( sad, cache64_ssse3 ); -DECL_PIXELS( uint64_t, var, mmx2, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, xop, ( pixel *pix, intptr_t i_stride )) -DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, sse2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx2, ( pixel *pix, intptr_t i_stride )) +DECL_PIXELS( uint64_t, var, avx512, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, mmx2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( pixel *pix, intptr_t i_stride )) DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( pixel *pix, intptr_t i_stride )) -- 2.40.0