From 8a9608bbbdf77ceb3ee537271549111468175a2b Mon Sep 17 00:00:00 2001
From: Henrik Gramner
Date: Tue, 11 Dec 2012 16:05:34 +0100
Subject: [PATCH] x86inc: Use VEX-encoded instructions in AVX functions

Automatically use VEX-encoding in AVX/AVX2/XOP/FMA3/FMA4 functions for
all instructions that exist in a VEX-encoded version.

This change makes it easier to extend existing code to use AVX2.

Also add support for AVX emulation of a few instructions that were
missing before.
---
 common/x86/dct-a.asm     |   2 +-
 common/x86/deblock-a.asm |  22 ++--
 common/x86/x86inc.asm    | 252 ++++++++++++++++++++++++++-------------
 3 files changed, 181 insertions(+), 95 deletions(-)

diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 8ee94c20..c9924236 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -1274,7 +1274,7 @@ cglobal zigzag_scan_8x8_field, 2,3,8
     mova       m1, [r1+ 4*SIZEOF_DCTCOEF] ; 07 06 05 04
     mova       m2, [r1+ 8*SIZEOF_DCTCOEF] ; 11 10 09 08
     pshuf%1    m3, m0, q3333              ; 03 03 03 03
-    movd       r2, m2                     ; 09 08
+    movd       r2d, m2                    ; 09 08
     pshuf%1    m2, m2, q0321              ; 08 11 10 09
     punpckl%2  m3, m1                     ; 05 03 04 03
     pinsr%1    m0, r2d, 3                 ; 08 02 01 00
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index a8be80e7..726729d5 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -171,7 +171,7 @@ cglobal deblock_v_luma, 5,5,8
 %define bm [rsp+mmsize*4]
     SUB        rsp, pad
     add        r1, r1
-    LOAD_AB    m4, m5, r2, r3
+    LOAD_AB    m4, m5, r2d, r3d
     mov        r3, 32/mmsize
     mov        r2, r0
     sub        r0, r1
@@ -227,7 +227,7 @@ cglobal deblock_h_luma, 5,6,8
 %define bm [rsp+mmsize*6]
     SUB        rsp, pad
     add        r1, r1
-    LOAD_AB    m4, m5, r2, r3
+    LOAD_AB    m4, m5, r2d, r3d
     mov        r3, r1
     mova       am, m4
     add        r3, r1
@@ -355,7 +355,7 @@ cglobal deblock_v_luma, 5,5,15
 %define mask1 m10
 %define mask2 m11
     add        r1, r1
-    LOAD_AB    m12, m13, r2, r3
+    LOAD_AB    m12, m13, r2d, r3d
     mov        r2, r0
     sub        r0, r1
     sub        r0, r1
@@ -382,7 +382,7 @@ cglobal deblock_v_luma, 5,5,15

 cglobal deblock_h_luma, 5,7,15
     add        r1, r1
-    LOAD_AB    m12, m13, r2, r3
+    LOAD_AB    m12, m13, r2d, r3d
     mov        r2, r1
     add        r2, r1
     add        r2, r1
@@ -1216,7 +1216,7 @@ cglobal deblock_%1_luma, 5,5,8,2*%2
     mova       m1, [r4+2*r1] ; p0
     mova       m2, [r0] ; q0
     mova       m3, [r0+r1] ; q1
-    LOAD_MASK  r2, r3
+    LOAD_MASK  r2d, r3d

     mov        r3, r4mp
     movd       m4, [r3] ; tc0
@@ -1660,7 +1660,7 @@ DEBLOCK_LUMA_INTRA v8

 %macro DEBLOCK_CHROMA 0
 cglobal deblock_inter_body
-    LOAD_AB    m4, m5, r2, r3
+    LOAD_AB    m4, m5, r2d, r3d
     LOAD_MASK  m0, m1, m2, m3, m4, m5, m7, m6, m4
     pxor       m4, m4
     LOAD_TC    m6, r4
@@ -1710,7 +1710,7 @@ cglobal deblock_h_chroma, 5,7,8

 cglobal deblock_intra_body
-    LOAD_AB    m4, m5, r2, r3
+    LOAD_AB    m4, m5, r2d, r3d
     LOAD_MASK  m0, m1, m2, m3, m4, m5, m7, m6, m4
     CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
     ret

@@ -1721,7 +1721,7 @@ cglobal deblock_intra_body
 cglobal deblock_v_chroma_intra, 4,6,8
     add        r1, r1
     mov        r5, 32/mmsize
-    movd       m5, r3
+    movd       m5, r3d
     mov        r4, r0
     sub        r0, r1
     sub        r0, r1
@@ -1766,7 +1766,7 @@ cglobal deblock_h_chroma_intra_mbaff, 4,6,8
     lea        r5, [r1*3]
 %endif
     CHROMA_H_LOAD r5
-    LOAD_AB    m4, m5, r2, r3
+    LOAD_AB    m4, m5, r2d, r3d
     LOAD_MASK  m0, m1, m2, m3, m4, m5, m7, m6, m4
     CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
     CHROMA_H_STORE r5
@@ -1788,7 +1788,7 @@ cglobal deblock_h_chroma_mbaff, 5,7,8
 .loop:
 %endif
     CHROMA_H_LOAD r6
-    LOAD_AB    m4, m5, r2, r3
+    LOAD_AB    m4, m5, r2d, r3d
     LOAD_MASK  m0, m1, m2, m3, m4, m5, m7, m6, m4
     movd       m6, [r4]
     punpcklbw  m6, m6
@@ -1832,7 +1832,7 @@ cglobal deblock_h_chroma_422, 5,7,8
     lea        r6, [r1*3]
 .loop:
     CHROMA_H_LOAD r6
-    LOAD_AB    m4, m5, r2m, r3
+    LOAD_AB    m4, m5, r2m, r3d
     LOAD_MASK  m0, m1, m2, m3, m4, m5, m7, m6, m4
     pxor       m4, m4
     movd       m6, [r4-1]
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index 1b81ff54..2eb6c652 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -746,7 +746,7 @@ SECTION .note.GNU-stack noalloc noexec nowrite progbits
     %if cpuflag(avx)
         %assign avx_enabled 1
     %endif
-    %if mmsize == 16 && notcpuflag(sse2)
+    %if (mmsize == 16 && notcpuflag(sse2)) || (mmsize == 32 && notcpuflag(avx2))
         %define mova movaps
         %define movu movups
         %define movnta movntps
@@ -825,10 +825,10 @@
     %if ARCH_X86_64
         %define num_mmregs 16
     %endif
-    %define mova vmovaps
-    %define movu vmovups
+    %define mova movdqa
+    %define movu movdqu
     %undef movh
-    %define movnta vmovntps
+    %define movnta movntdq
     %assign %%i 0
     %rep num_mmregs
         CAT_XDEFINE m, %%i, ymm %+ %%i
@@ -989,101 +989,107 @@ INIT_XMM

 ;%1 == instruction
 ;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == number of operands given
+;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
 ;%5+: operands
-%macro RUN_AVX_INSTR 6-7+
-    %ifid %6
-        %define %%sizeofreg sizeof%6
-    %elifid %5
-        %define %%sizeofreg sizeof%5
+%macro RUN_AVX_INSTR 5-8+
+    %ifnum sizeof%6
+        %assign %%sizeofreg sizeof%6
+    %elifnum sizeof%5
+        %assign %%sizeofreg sizeof%5
     %else
-        %define %%sizeofreg mmsize
+        %assign %%sizeofreg mmsize
     %endif
-    %if %%sizeofreg==32
-        %if %4>=3
-            v%1 %5, %6, %7
-        %else
-            v%1 %5, %6
-        %endif
+    %assign %%emulate_avx 0
+    %if avx_enabled && %%sizeofreg >= 16
+        %xdefine %%instr v%1
     %else
-        %if %%sizeofreg==8
-            %define %%regmov movq
-        %elif %2
-            %define %%regmov movaps
-        %else
-            %define %%regmov movdqa
+        %xdefine %%instr %1
+        %if %0 >= 7+%3
+            %assign %%emulate_avx 1
         %endif
+    %endif

-        %if %4>=3+%3
-            %ifnidn %5, %6
-                %if avx_enabled && %%sizeofreg==16
-                    v%1 %5, %6, %7
-                %else
-                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
-                    %%regmov %5, %6
-                    %1 %5, %7
+    %if %%emulate_avx
+        %xdefine %%src1 %6
+        %xdefine %%src2 %7
+        %ifnidn %5, %6
+            %if %0 >= 8
+                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7, %8}, %5, %7, %8
+            %else
+                CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+            %endif
+            %if %4 && %3 == 0
+                %ifnid %7
+                    ; 3-operand AVX instructions with a memory arg can only have it in src2,
+                    ; whereas SSE emulation prefers to have it in src1 (i.e. the mov).
+                    ; So, if the instruction is commutative with a memory arg, swap them.
+                    %xdefine %%src1 %7
+                    %xdefine %%src2 %6
                 %endif
+            %endif
+            %if %%sizeofreg == 8
+                MOVQ %5, %%src1
+            %elif %2
+                MOVAPS %5, %%src1
             %else
-                %1 %5, %7
+                MOVDQA %5, %%src1
             %endif
-        %elif %4>=3
-            %1 %5, %6, %7
-        %else
-            %1 %5, %6
         %endif
-    %endif
-%endmacro
-
-; 3arg AVX ops with a memory arg can only have it in src2,
-; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
-; So, if the op is symmetric and the wrong one is memory, swap them.
-%macro RUN_AVX_INSTR1 8
-    %assign %%swap 0
-    %if avx_enabled
-        %ifnid %6
-            %assign %%swap 1
-        %endif
-    %elifnidn %5, %6
-        %ifnid %7
-            %assign %%swap 1
+        %if %0 >= 8
+            %1 %5, %%src2, %8
+        %else
+            %1 %5, %%src2
         %endif
-    %endif
-    %if %%swap && %3 == 0 && %8 == 1
-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+    %elif %0 >= 8
+        %%instr %5, %6, %7, %8
+    %elif %0 == 7
+        %%instr %5, %6, %7
+    %elif %0 == 6
+        %%instr %5, %6
     %else
-        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+        %%instr %5
     %endif
 %endmacro

 ;%1 == instruction
 ;%2 == 1 if float, 0 if int
-;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
-;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
-%macro AVX_INSTR 4
-    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
-        %ifidn %3, fnord
-            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+;%3 == 1 if non-destructive or 4-operand (xmm, xmm, xmm, imm), 0 otherwise
+;%4 == 1 if commutative (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 1-4 0, 1, 0
+    %macro %1 1-9 fnord, fnord, fnord, fnord, %1, %2, %3, %4
+        %ifidn %2, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %1
+        %elifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2
         %elifidn %4, fnord
-            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3
         %elifidn %5, fnord
-            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4
         %else
-            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+            RUN_AVX_INSTR %6, %7, %8, %9, %1, %2, %3, %4, %5
         %endif
     %endmacro
 %endmacro

+; Instructions with both VEX and non-VEX encodings
+; Non-destructive instructions are written without parameters
 AVX_INSTR addpd, 1, 0, 1
 AVX_INSTR addps, 1, 0, 1
 AVX_INSTR addsd, 1, 0, 1
 AVX_INSTR addss, 1, 0, 1
 AVX_INSTR addsubpd, 1, 0, 0
 AVX_INSTR addsubps, 1, 0, 0
-AVX_INSTR andpd, 1, 0, 1
-AVX_INSTR andps, 1, 0, 1
+AVX_INSTR aesdec, 0, 0, 0
+AVX_INSTR aesdeclast, 0, 0, 0
+AVX_INSTR aesenc, 0, 0, 0
+AVX_INSTR aesenclast, 0, 0, 0
+AVX_INSTR aesimc
+AVX_INSTR aeskeygenassist
 AVX_INSTR andnpd, 1, 0, 0
 AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
 AVX_INSTR blendpd, 1, 0, 0
 AVX_INSTR blendps, 1, 0, 0
 AVX_INSTR blendvpd, 1, 0, 0
@@ -1092,18 +1098,39 @@ AVX_INSTR cmppd, 1, 0, 0
 AVX_INSTR cmpps, 1, 0, 0
 AVX_INSTR cmpsd, 1, 0, 0
 AVX_INSTR cmpss, 1, 0, 0
-AVX_INSTR cvtdq2ps, 1, 0, 0
-AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR comisd
+AVX_INSTR comiss
+AVX_INSTR cvtdq2pd
+AVX_INSTR cvtdq2ps
+AVX_INSTR cvtpd2dq
+AVX_INSTR cvtpd2ps
+AVX_INSTR cvtps2dq
+AVX_INSTR cvtps2pd
+AVX_INSTR cvtsd2si
+AVX_INSTR cvtsd2ss
+AVX_INSTR cvtsi2sd
+AVX_INSTR cvtsi2ss
+AVX_INSTR cvtss2sd
+AVX_INSTR cvtss2si
+AVX_INSTR cvttpd2dq
+AVX_INSTR cvttps2dq
+AVX_INSTR cvttsd2si
+AVX_INSTR cvttss2si
 AVX_INSTR divpd, 1, 0, 0
 AVX_INSTR divps, 1, 0, 0
 AVX_INSTR divsd, 1, 0, 0
 AVX_INSTR divss, 1, 0, 0
 AVX_INSTR dppd, 1, 1, 0
 AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR extractps
 AVX_INSTR haddpd, 1, 0, 0
 AVX_INSTR haddps, 1, 0, 0
 AVX_INSTR hsubpd, 1, 0, 0
 AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR insertps, 1, 1, 0
+AVX_INSTR lddqu
+AVX_INSTR ldmxcsr
+AVX_INSTR maskmovdqu
 AVX_INSTR maxpd, 1, 0, 1
 AVX_INSTR maxps, 1, 0, 1
 AVX_INSTR maxsd, 1, 0, 1
@@ -1112,10 +1139,31 @@ AVX_INSTR minpd, 1, 0, 1
 AVX_INSTR minps, 1, 0, 1
 AVX_INSTR minsd, 1, 0, 1
 AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movapd
+AVX_INSTR movaps
+AVX_INSTR movd
+AVX_INSTR movddup
+AVX_INSTR movdqa
+AVX_INSTR movdqu
 AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movhpd, 1, 0, 0
+AVX_INSTR movhps, 1, 0, 0
 AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movlpd, 1, 0, 0
+AVX_INSTR movlps, 1, 0, 0
+AVX_INSTR movmskpd
+AVX_INSTR movmskps
+AVX_INSTR movntdq
+AVX_INSTR movntdqa
+AVX_INSTR movntpd
+AVX_INSTR movntps
+AVX_INSTR movq
 AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movshdup
+AVX_INSTR movsldup
 AVX_INSTR movss, 1, 0, 0
+AVX_INSTR movupd
+AVX_INSTR movups
 AVX_INSTR mpsadbw, 0, 1, 0
 AVX_INSTR mulpd, 1, 0, 1
 AVX_INSTR mulps, 1, 0, 1
@@ -1123,9 +1171,9 @@ AVX_INSTR mulsd, 1, 0, 1
 AVX_INSTR mulss, 1, 0, 1
 AVX_INSTR orpd, 1, 0, 1
 AVX_INSTR orps, 1, 0, 1
-AVX_INSTR pabsb, 0, 0, 0
-AVX_INSTR pabsw, 0, 0, 0
-AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR pabsb
+AVX_INSTR pabsd
+AVX_INSTR pabsw
 AVX_INSTR packsswb, 0, 0, 0
 AVX_INSTR packssdw, 0, 0, 0
 AVX_INSTR packuswb, 0, 0, 0
@@ -1145,10 +1193,11 @@ AVX_INSTR pavgb, 0, 0, 1
 AVX_INSTR pavgw, 0, 0, 1
 AVX_INSTR pblendvb, 0, 0, 0
 AVX_INSTR pblendw, 0, 1, 0
-AVX_INSTR pcmpestri, 0, 0, 0
-AVX_INSTR pcmpestrm, 0, 0, 0
-AVX_INSTR pcmpistri, 0, 0, 0
-AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pclmulqdq, 0, 1, 0
+AVX_INSTR pcmpestri
+AVX_INSTR pcmpestrm
+AVX_INSTR pcmpistri
+AVX_INSTR pcmpistrm
 AVX_INSTR pcmpeqb, 0, 0, 1
 AVX_INSTR pcmpeqw, 0, 0, 1
 AVX_INSTR pcmpeqd, 0, 0, 1
@@ -1157,12 +1206,21 @@ AVX_INSTR pcmpgtb, 0, 0, 0
 AVX_INSTR pcmpgtw, 0, 0, 0
 AVX_INSTR pcmpgtd, 0, 0, 0
 AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR pextrb
+AVX_INSTR pextrd
+AVX_INSTR pextrq
+AVX_INSTR pextrw
 AVX_INSTR phaddw, 0, 0, 0
 AVX_INSTR phaddd, 0, 0, 0
 AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phminposuw
 AVX_INSTR phsubw, 0, 0, 0
 AVX_INSTR phsubd, 0, 0, 0
 AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pinsrb, 0, 1, 0
+AVX_INSTR pinsrd, 0, 1, 0
+AVX_INSTR pinsrq, 0, 1, 0
+AVX_INSTR pinsrw, 0, 1, 0
 AVX_INSTR pmaddwd, 0, 0, 1
 AVX_INSTR pmaddubsw, 0, 0, 0
 AVX_INSTR pmaxsb, 0, 0, 1
@@ -1177,20 +1235,32 @@ AVX_INSTR pminsd, 0, 0, 1
 AVX_INSTR pminub, 0, 0, 1
 AVX_INSTR pminuw, 0, 0, 1
 AVX_INSTR pminud, 0, 0, 1
-AVX_INSTR pmovmskb, 0, 0, 0
-AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmovmskb
+AVX_INSTR pmovsxbw
+AVX_INSTR pmovsxbd
+AVX_INSTR pmovsxbq
+AVX_INSTR pmovsxwd
+AVX_INSTR pmovsxwq
+AVX_INSTR pmovsxdq
+AVX_INSTR pmovzxbw
+AVX_INSTR pmovzxbd
+AVX_INSTR pmovzxbq
+AVX_INSTR pmovzxwd
+AVX_INSTR pmovzxwq
+AVX_INSTR pmovzxdq
+AVX_INSTR pmuldq, 0, 0, 1
 AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhuw, 0, 0, 1
 AVX_INSTR pmulhw, 0, 0, 1
 AVX_INSTR pmullw, 0, 0, 1
 AVX_INSTR pmulld, 0, 0, 1
 AVX_INSTR pmuludq, 0, 0, 1
-AVX_INSTR pmuldq, 0, 0, 1
 AVX_INSTR por, 0, 0, 1
 AVX_INSTR psadbw, 0, 0, 1
 AVX_INSTR pshufb, 0, 0, 0
-AVX_INSTR pshufd, 0, 1, 0
-AVX_INSTR pshufhw, 0, 1, 0
-AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR pshufd
+AVX_INSTR pshufhw
+AVX_INSTR pshuflw
 AVX_INSTR psignb, 0, 0, 0
 AVX_INSTR psignw, 0, 0, 0
 AVX_INSTR psignd, 0, 0, 0
@@ -1212,7 +1282,7 @@ AVX_INSTR psubsb, 0, 0, 0
 AVX_INSTR psubsw, 0, 0, 0
 AVX_INSTR psubusb, 0, 0, 0
 AVX_INSTR psubusw, 0, 0, 0
-AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR ptest
 AVX_INSTR punpckhbw, 0, 0, 0
 AVX_INSTR punpckhwd, 0, 0, 0
 AVX_INSTR punpckhdq, 0, 0, 0
@@ -1222,11 +1292,27 @@ AVX_INSTR punpcklwd, 0, 0, 0
 AVX_INSTR punpckldq, 0, 0, 0
 AVX_INSTR punpcklqdq, 0, 0, 0
 AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR rcpps, 1, 0, 0
+AVX_INSTR rcpss, 1, 0, 0
+AVX_INSTR roundpd
+AVX_INSTR roundps
+AVX_INSTR roundsd
+AVX_INSTR roundss
+AVX_INSTR rsqrtps, 1, 0, 0
+AVX_INSTR rsqrtss, 1, 0, 0
+AVX_INSTR shufpd, 1, 1, 0
 AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR sqrtpd, 1, 0, 0
+AVX_INSTR sqrtps, 1, 0, 0
+AVX_INSTR sqrtsd, 1, 0, 0
+AVX_INSTR sqrtss, 1, 0, 0
+AVX_INSTR stmxcsr
 AVX_INSTR subpd, 1, 0, 0
 AVX_INSTR subps, 1, 0, 0
 AVX_INSTR subsd, 1, 0, 0
 AVX_INSTR subss, 1, 0, 0
+AVX_INSTR ucomisd
+AVX_INSTR ucomiss
 AVX_INSTR unpckhpd, 1, 0, 0
 AVX_INSTR unpckhps, 1, 0, 0
 AVX_INSTR unpcklpd, 1, 0, 0
--
2.40.0
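
Editor's note (not part of the patch): the sketch below illustrates what the
reworked AVX_INSTR/RUN_AVX_INSTR machinery does for code written on top of
x86inc.asm. The function name, constant, and register counts are made up for
illustration; it assumes the patched x86inc.asm is on the include path and
that INIT_XMM/cpuflags behave as in x264. The same 3-operand source assembles
to VEX-encoded instructions under INIT_XMM avx and falls back to the mov+op
emulation path under INIT_XMM sse2.

    %include "x86inc.asm"

    SECTION_RODATA
    pw_64: times 8 dw 64

    SECTION .text

    ; hypothetical example: dst[i] = src0[i] + src1[i] + 64 for 8 words
    ; void add_round(int16_t *dst, const int16_t *src0, const int16_t *src1);
    %macro ADD_ROUND 0
    cglobal add_round, 3,3,2
        mova    m0, [r1]
        paddw   m0, [r2]         ; becomes paddw (SSE2) or vpaddw (AVX)
        paddw   m1, m0, [pw_64]  ; SSE2: expands to "mova m1, [pw_64]" + "paddw m1, m0"
                                 ;       (memory source moved into the mova because
                                 ;        paddw is flagged commutative)
                                 ; AVX:  vpaddw m1, m0, [pw_64]
        mova    [r0], m1
        RET
    %endmacro

    INIT_XMM sse2
    ADD_ROUND                    ; emits add_round_sse2 with legacy SSE encodings
    INIT_XMM avx
    ADD_ROUND                    ; emits add_round_avx with VEX encodings

With the old x86inc, only explicitly whitelisted 3-operand forms were rewritten;
after this patch every wrapped mnemonic automatically picks its VEX form whenever
avx_enabled is set, which is what makes later AVX2 (ymm) ports of existing code
mostly a matter of switching to INIT_YMM.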