From: Henrik Gramner
Date: Fri, 28 Apr 2017 19:35:25 +0000 (+0200)
Subject: x86: AVX-512 pixel_sa8d_8x8
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1cf7baa462ca52de7f07d6e4c795853900bb50bb;p=libx264

x86: AVX-512 pixel_sa8d_8x8
---

diff --git a/common/pixel.c b/common/pixel.c
index b68bb4c2..c33a873f 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1355,6 +1355,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     if( cpu&X264_CPU_AVX512 )
     {
         INIT8( satd, _avx512 );
+        pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512;
     }
 #endif //HAVE_MMX
 
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 9c6ed6c8..9b3dc27b 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4626,7 +4626,7 @@ cglobal intra_sad_x9_8x8, 5,7,8
     HMAXABSW2 0, 1, 2, 3
 %endmacro
 
-%macro SATD_AVX512_END 0
+%macro SATD_AVX512_END 0-1 0 ; sa8d
     paddw m0 {k1}{z}, m1 ; zero-extend to dwords
 %if ARCH_X86_64
 %if mmsize == 64
@@ -4641,10 +4641,19 @@ cglobal intra_sad_x9_8x8, 5,7,8
     paddd xmm0, xmm1
     movq rax, xmm0
     rorx rdx, rax, 32
+%if %1
+    lea eax, [rax+rdx+1]
+    shr eax, 1
+%else
     add eax, edx
+%endif
 %else
     HADDD m0, m1
     movd eax, xm0
+%if %1
+    inc eax
+    shr eax, 1
+%endif
 %endif
     RET
 %endmacro
@@ -4789,6 +4798,29 @@ cglobal pixel_satd_4x4, 4,5
     SWAP 0, 1
     SATD_AVX512_END
 
+INIT_ZMM avx512
+cglobal pixel_sa8d_8x8, 4,6
+    vbroadcasti64x4 m4, [hmul_16p]
+    mov r4d, 0x55555555
+    kmovd k1, r4d ; 01010101
+    kshiftlb k2, k1, 5 ; 10100000
+    kshiftlb k3, k1, 4 ; 01010000
+    lea r4, [3*r1]
+    lea r5, [3*r3]
+    SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4
+    DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5
+    SUMSUB_BA w, 0, 1, 2
+    SBUTTERFLY qdq, 0, 1, 2
+    SUMSUB_BA w, 0, 1, 2
+    shufps m2, m0, m1, q2020
+    shufps m1, m0, m1, q3131
+    SUMSUB_BA w, 2, 1, 0
+    vshufi32x4 m0, m2, m1, q1010
+    vshufi32x4 m1, m2, m1, q3232
+    SUMSUB_BA w, 0, 1, 2
+    HMAXABSW2 0, 1, 2, 3
+    SATD_AVX512_END 1
+
 %endif ; HIGH_BIT_DEPTH
 
 ;=============================================================================
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 62b9fb42..d7753f53 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -84,6 +84,7 @@ DECL_X1( sa8d, sse4 )
 DECL_X1( sa8d, avx )
 DECL_X1( sa8d, xop )
 DECL_X1( sa8d, avx2 )
+DECL_X1( sa8d, avx512 )
 DECL_X1( sad, cache32_mmx2 );
 DECL_X1( sad, cache64_mmx2 );
 DECL_X1( sad, cache64_sse2 );