From bc88d1bb331ee061c38bea80f7a54a76797c31d0 Mon Sep 17 00:00:00 2001
From: Fiona Glaser <fiona@x264.com>
Date: Thu, 9 May 2013 17:20:05 -0700
Subject: [PATCH] x86-64: 64-bit variant of AVX2 hpel_filter

~5% faster than 32-bit.
---
 common/x86/mc-a2.asm   | 64 +++++++++++++++++++++++++++++-------------
 common/x86/mc-c.c      |  3 +-
 common/x86/x86util.asm | 13 ++++++++-
 3 files changed, 59 insertions(+), 21 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 8f0a0bd6..893c0655 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -37,6 +37,7 @@ filt_mul15: times 16 db 1, -5
 filt_mul51: times 16 db -5, 1
 hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
 deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
+
 %if HIGH_BIT_DEPTH
 deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
 deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
@@ -642,7 +643,6 @@ INIT_XMM avx
 HPEL_C
 HPEL_V 0
 HPEL_H
-%endif
 INIT_YMM avx2
 HPEL_V 8
 HPEL_C
@@ -682,15 +682,16 @@ cglobal hpel_filter_h, 3,3,8
     add        r2, mmsize
     jl .loop
     RET
+%endif

 %if ARCH_X86_64
 %macro DO_FILT_V 5
 ;The optimum prefetch distance is difficult to determine in checkasm:
 ;any prefetch seems slower than not prefetching.
 ;In real use, the prefetch seems to be a slight win.
-;+16 is picked somewhat arbitrarily here based on the fact that even one
+;+mmsize is picked somewhat arbitrarily here based on the fact that even one
 ;loop iteration is going to take longer than the prefetch.
-    prefetcht0 [r1+r2*2+16]
+    prefetcht0 [r1+r2*2+mmsize]
 %if cpuflag(ssse3)
     mova       m1, [r3]
     mova       m2, [r3+r2]
@@ -723,31 +724,48 @@ cglobal hpel_filter_h, 3,3,8
     packuswb   %3, %4
     FILT_V2    m1, m2, m3, m4, m5, m6
 %endif
-    add        r3, 16
-    add        r1, 16
+    add        r3, mmsize
+    add        r1, mmsize
+%if mmsize==32
+    vinserti128 %1, m1, xm4, 1
+    vperm2i128  %2, m1, m4, q0301
+%else
     mova       %1, m1
     mova       %2, m4
+%endif
     FILT_PACK  m1, m4, m15, 5
     movntps    [r8+r4+%5], m1
 %endmacro

-%macro FILT_C 4
-    PALIGNR    m1, %2, %1, 12, m2
-    PALIGNR    m2, %2, %1, 14, %1
+%macro FILT_C 3
+%if mmsize==32
+    vperm2i128 m3, %2, %1, q0003
+%endif
+    PALIGNR    m1, %2, %1, (mmsize-4), m3
+    PALIGNR    m2, %2, %1, (mmsize-2), m3
+%if mmsize==32
+    vperm2i128 %1, %3, %2, q0003
+%endif
     PALIGNR    m3, %3, %2, 4, %1
     PALIGNR    m4, %3, %2, 2, %1
     paddw      m3, m2
+%if mmsize==32
+    mova       m2, %1
+%endif
     mova       %1, %3
-    PALIGNR    %3, %2, 6, m2
+    PALIGNR    %3, %3, %2, 6, m2
     paddw      m4, %2
     paddw      %3, m1
     FILT_H     %3, m3, m4
 %endmacro

 %macro DO_FILT_C 4
-    FILT_C %1, %2, %3, 6
-    FILT_C %2, %1, %4, 6
+    FILT_C %1, %2, %3
+    FILT_C %2, %1, %4
     FILT_PACK %3, %4, m15, 6
+%if mmsize==32
+    vpermq %3, %3, q3120
+%endif
     movntps [r5+r4], %3
 %endmacro
@@ -761,8 +779,14 @@ cglobal hpel_filter_h, 3,3,8
 %endmacro

 %macro DO_FILT_H 3
-    PALIGNR    m1, %2, %1, 14, m3
-    PALIGNR    m2, %2, %1, 15, m3
+%if mmsize==32
+    vperm2i128 m3, %2, %1, q0003
+%endif
+    PALIGNR    m1, %2, %1, (mmsize-2), m3
+    PALIGNR    m2, %2, %1, (mmsize-1), m3
+%if mmsize==32
+    vperm2i128 m3, %3, %2, q0003
+%endif
     PALIGNR    m4, %3, %2, 1 , m3
     PALIGNR    m5, %3, %2, 2 , m3
     PALIGNR    m6, %3, %2, 3 , m3
@@ -798,9 +822,9 @@ cglobal hpel_filter_h, 3,3,8
 ;-----------------------------------------------------------------------------
 cglobal hpel_filter, 7,9,16
     mov        r7, r3
-    sub       r5d, 16
+    sub       r5d, mmsize
     mov        r8, r1
-    and        r7, 15
+    and        r7, mmsize-1
     sub        r3, r7
     add        r0, r5
     add        r8, r5
@@ -827,7 +851,7 @@ cglobal hpel_filter, 7,9,16
     DO_FILT_V  m8, m7, m13, m12, 0
 ;ALIGN 16
 .loopx:
-    DO_FILT_V  m6, m5, m11, m12, 16
+    DO_FILT_V  m6, m5, m11, m12, mmsize
 .lastx:
 %if cpuflag(ssse3)
     psrlw      m15, 1 ; pw_512
@@ -840,11 +864,11 @@ cglobal hpel_filter, 7,9,16
 %else
     psrlw      m15, 1 ; pw_16
 %endif
-    movdqa     m7, m5
+    mova       m7, m5
     DO_FILT_H  m10, m13, m11
-    add        r4, 16
+    add        r4, mmsize
     jl .loopx
-    cmp        r4, 16
+    cmp        r4, mmsize
     jl .lastx
 ; setup regs for next y
     sub        r4, r7
     sub        r4, r2
@@ -867,6 +891,8 @@ INIT_XMM ssse3
 HPEL
 INIT_XMM avx
 HPEL
+INIT_YMM avx2
+HPEL
 %endif ; ARCH_X86_64

 %undef movntq
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index cfdc91fd..d83958de 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -470,12 +470,13 @@
 HPEL(16, sse2_amd, mmx2, mmx2, sse2)
 void x264_hpel_filter_sse2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
 void x264_hpel_filter_ssse3( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
 void x264_hpel_filter_avx  ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
+void x264_hpel_filter_avx2 ( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, intptr_t stride, int width, int height, int16_t *buf );
 #else
 HPEL(16, sse2, sse2, sse2, sse2)
 HPEL(16, ssse3, ssse3, ssse3, ssse3)
 HPEL(16, avx, avx, avx, avx)
-#endif
 HPEL(32, avx2, avx2, avx2, avx2)
+#endif
 HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
 #endif // HIGH_BIT_DEPTH
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index a77016cf..8a5365e0 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -341,7 +341,18 @@
 %endmacro

 %macro PALIGNR 4-5 ; [dst,] src1, src2, imm, tmp
-%if cpuflag(ssse3)
+; AVX2 version uses a precalculated extra input that
+; can be re-used across calls
+%if sizeof%1==32
+    ; %3 = abcdefgh ijklmnop (lower address)
+    ; %2 = ABCDEFGH IJKLMNOP (higher address)
+;   vperm2i128 %5, %2, %3, q0003 ; %5 = ijklmnop ABCDEFGH
+%if %4 < 16
+    palignr %1, %5, %3, %4      ; %1 = bcdefghi jklmnopA
+%else
+    palignr %1, %2, %5, %4-16   ; %1 = pABCDEFG HIJKLMNO
+%endif
+%elif cpuflag(ssse3)
 %if %0==5
     palignr %1, %2, %3, %4
 %else
-- 
2.40.0
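
A note on the PALIGNR technique for readers following along: AVX2's vpalignr
is not a full 256-bit byte shift; it concatenates and shifts each 128-bit lane
independently. A whole-register shift across a 64-byte register pair therefore
needs one cross-lane vperm2i128 to build a "bridge" register first, which is
why the new PALIGNR arm takes a precalculated fifth operand and why FILT_C and
DO_FILT_H hoist that vperm2i128 out of the macro so several PALIGNR calls on
the same register pair can share it. The C intrinsics sketch below mirrors the
macro's two arms; the BRIDGE256/PALIGNR256_* names and the demo are
illustrative only, not part of x264 (build with -mavx2):

#include <stdio.h>
#include <immintrin.h>

/* bridge = lo[255:128] : hi[127:0], i.e. the middle 32 bytes of the pair --
 * same as the macro's "vperm2i128 %5, %2, %3, q0003" (q0003 == 0x03). */
#define BRIDGE256(hi, lo) _mm256_permute2x128_si256((hi), (lo), 0x03)

/* Byte-shift the 64-byte pair lo:hi right by N (compile-time constant,
 * 1..15): mirrors the "%if %4 < 16" arm, "palignr %1, %5, %3, %4". */
#define PALIGNR256_LT16(lo, bridge, N) _mm256_alignr_epi8((bridge), (lo), (N))

/* N in 16..31: mirrors the %else arm, "palignr %1, %2, %5, %4-16". */
#define PALIGNR256_GE16(hi, bridge, N) _mm256_alignr_epi8((hi), (bridge), (N) - 16)

int main(void)
{
    unsigned char buf[64], out[32];
    for (int i = 0; i < 64; i++)
        buf[i] = (unsigned char)i;   /* bytes 0..63 in memory order */
    __m256i lo = _mm256_loadu_si256((const __m256i *)buf);        /* bytes 0..31  */
    __m256i hi = _mm256_loadu_si256((const __m256i *)(buf + 32)); /* bytes 32..63 */

    __m256i bridge = BRIDGE256(hi, lo); /* computed once, shared by both shifts */

    _mm256_storeu_si256((__m256i *)out, PALIGNR256_LT16(lo, bridge, 2));
    printf("shift 2:  out[0]=%d out[31]=%d (expect 2, 33)\n", out[0], out[31]);

    _mm256_storeu_si256((__m256i *)out, PALIGNR256_GE16(hi, bridge, 17));
    printf("shift 17: out[0]=%d out[31]=%d (expect 17, 48)\n", out[0], out[31]);
    return 0;
}

Hoisting the permute is the point of the precalculated operand: DO_FILT_H, for
instance, issues three PALIGNRs against the same register pair, so a single
vperm2i128 amortizes across all of them.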