From: Craig Topper Date: Mon, 9 Jan 2017 02:44:34 +0000 (+0000) Subject: [AVX-512] Add patterns to use a zero masked VPTERNLOG instruction for vselects of... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3612268cf23761100136e56e03a7a2205f3da56e;p=llvm [AVX-512] Add patterns to use a zero masked VPTERNLOG instruction for vselects of all ones and all zeros. Previously we emitted a VPTERNLOG and a separate masked move. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291415 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 774a904a826..75a265bf8a1 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -443,6 +443,22 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", [(set VR512:$dst, (v16i32 immAllOnesV))]>; } +// Alias instructions that allow VPTERNLOG to be used with a mask to create +// a mix of all ones and all zeros elements. This is done this way to force +// the same register to be used as input for all three sources. +let isPseudo = 1, Predicates = [HasAVX512] in { +def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst), + (ins VK16WM:$mask), "", + [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask), + (v16i32 immAllOnesV), + (v16i32 immAllZerosV)))]>; +def AVX512_512_SEXT_MASK_64 : I<0, Pseudo, (outs VR512:$dst), + (ins VK8WM:$mask), "", + [(set VR512:$dst, (vselect (v8i1 VK8WM:$mask), + (bc_v8i64 (v16i32 immAllOnesV)), + (bc_v8i64 (v16i32 immAllZerosV))))]>; +} + let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, Predicates = [HasVLX], SchedRW = [WriteZero] in { def AVX512_128_SET0 : I<0, Pseudo, (outs VR128X:$dst), (ins), "", diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 189c6f89be3..e3484d062bc 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -6864,6 +6864,21 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { .addReg(Reg, RegState::Undef).addImm(0xff); return true; } + case X86::AVX512_512_SEXT_MASK_32: + case X86::AVX512_512_SEXT_MASK_64: { + unsigned Reg = MIB->getOperand(0).getReg(); + unsigned MaskReg = MIB->getOperand(1).getReg(); + unsigned MaskState = getRegState(MIB->getOperand(1)); + unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ? + X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz; + MI.RemoveOperand(1); + MIB->setDesc(get(Opc)); + // VPTERNLOG needs 3 register inputs and an immediate. + // 0xff will return 1s for any input. + MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState) + .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff); + return true; + } case X86::VMOVAPSZ128rm_NOVLX: return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm), get(X86::VBROADCASTF32X4rm), X86::sub_xmm); diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 532678ae72f..1a91bc1dee9 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -25,8 +25,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -48,8 +47,7 @@ define <16 x i1> @test2(<16 x i1>%a, <16 x i1>%b) { ; KNL_X32-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmd %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmd %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <16 x i1>%a, %b @@ -65,8 +63,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -88,8 +85,7 @@ define <8 x i1> @test3(<8 x i1>%a, <8 x i1>%b) { ; KNL_X32-NEXT: vpsllq $63, %zmm0, %zmm0 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL_X32-NEXT: vptestmq %zmm1, %zmm1, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: retl %c = and <8 x i1>%a, %b @@ -180,8 +176,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL-NEXT: Lcfi1: ; KNL-NEXT: .cfi_def_cfa_offset 16 ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: callq _func16xi1 ; KNL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -210,8 +205,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) { ; KNL_X32-NEXT: Lcfi1: ; KNL_X32-NEXT: .cfi_def_cfa_offset 16 ; KNL_X32-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovdb %zmm0, %xmm0 ; KNL_X32-NEXT: calll _func16xi1 ; KNL_X32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero @@ -285,8 +279,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL-NEXT: movb $85, %al ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: popq %rax ; KNL-NEXT: retq @@ -322,8 +315,7 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) { ; KNL_X32-NEXT: movb $85, %al ; KNL_X32-NEXT: kmovw %eax, %k1 ; KNL_X32-NEXT: vptestmq %zmm0, %zmm0, %k1 {%k1} -; KNL_X32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL_X32-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL_X32-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL_X32-NEXT: vpmovqw %zmm0, %xmm0 ; KNL_X32-NEXT: addl $12, %esp ; KNL_X32-NEXT: retl diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll index c2eb19d1665..5e50a3aef2f 100644 --- a/test/CodeGen/X86/avx512-cvt.ll +++ b/test/CodeGen/X86/avx512-cvt.ll @@ -740,8 +740,7 @@ define <16 x float> @sitofp_16i1_float(<16 x i32> %a) { ; KNL: ## BB#0: ; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vcvtdq2ps %zmm0, %zmm0 ; KNL-NEXT: retq ; @@ -805,11 +804,10 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) { ; KNL-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; KNL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 ; KNL-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm1, %ymm1 ; KNL-NEXT: vcvtdq2pd %ymm1, %zmm1 ; KNL-NEXT: retq @@ -834,8 +832,7 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) { ; KNL: ## BB#0: ; KNL-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; KNL-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 ; KNL-NEXT: retq @@ -858,8 +855,7 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) { ; KNL-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 ; KNL-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; KNL-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: vcvtdq2ps %ymm0, %ymm0 ; KNL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 32bd0804d63..358225b6fe0 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -1457,8 +1457,7 @@ define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { ; KNL-LABEL: sext_16i1_16i32: ; KNL: ## BB#0: ; KNL-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: sext_16i1_16i32: diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index 26d14fa0840..cb8ed0e59a3 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -365,11 +365,10 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15] -; KNL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpslld $31, %zmm2, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -402,11 +401,10 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax @@ -1242,30 +1240,29 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; KNL-NEXT: vpextrd $1, %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vmovd %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k2} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,2,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; KNL-NEXT: vpsllq $63, %zmm4, %zmm2 -; KNL-NEXT: vptestmq %zmm2, %zmm2, %k2 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm3, %zmm2, %zmm4 -; KNL-NEXT: vpsllq $63, %zmm4, %zmm2 -; KNL-NEXT: vptestmq %zmm2, %zmm2, %k1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,2,3,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm1, %zmm2, %zmm3 +; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k2 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,8,3,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 +; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 +; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpextrd $3, %xmm0, %eax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,8,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm2, %zmm1 -; KNL-NEXT: vpsllq $63, %zmm1, %zmm0 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,8,4,5,6,7] +; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: retq @@ -1306,11 +1303,10 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y) ; KNL-NEXT: vmovq %xmm0, %rax ; KNL-NEXT: andl $1, %eax ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,2,3,4,5,6,7] -; KNL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index d48f63536e0..b127585dc87 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -344,8 +344,7 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB17_1: ; KNL-NEXT: vpcmpgtd %zmm2, %zmm0, %k1 ; KNL-NEXT: LBB17_3: -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -382,8 +381,7 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { ; KNL-NEXT: LBB18_3: ; KNL-NEXT: vpslld $31, %zmm0, %zmm0 ; KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -472,8 +470,7 @@ define <16 x i1> @test15(i32 %x, i32 %y) { ; KNL-NEXT: movw $1, %cx ; KNL-NEXT: cmovgw %ax, %cx ; KNL-NEXT: kmovw %ecx, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -510,28 +507,27 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: movl %edi, (%rsp) ; KNL-NEXT: shrq $32, %rdi ; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpmovdb %zmm0, %xmm0 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm2 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; KNL-NEXT: movl $1, %eax -; KNL-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1 -; KNL-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] +; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 +; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; KNL-NEXT: vpsllw $7, %ymm2, %ymm0 -; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -574,30 +570,29 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: movl %edi, (%rsp) ; KNL-NEXT: shrq $32, %rdi ; KNL-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 ; KNL-NEXT: kmovw (%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; KNL-NEXT: xorl %eax, %eax ; KNL-NEXT: cmpl %edx, %esi ; KNL-NEXT: setg %al ; KNL-NEXT: vpinsrb $5, %eax, %xmm0, %xmm0 -; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; KNL-NEXT: vpsllw $7, %ymm0, %ymm0 ; KNL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %ymm2, %ymm2, %ymm2 -; KNL-NEXT: vpcmpgtb %ymm0, %ymm2, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ; KNL-NEXT: kmovw {{[0-9]+}}(%rsp), %k2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: movq %rbp, %rsp ; KNL-NEXT: popq %rbp ; KNL-NEXT: retq @@ -635,18 +630,17 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: kshiftlw $6, %k2, %k2 ; KNL-NEXT: kshiftrw $15, %k2, %k2 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; KNL-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2} {z} -; KNL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,3,4,5,8,7] -; KNL-NEXT: vpermi2q %zmm2, %zmm1, %zmm3 -; KNL-NEXT: vpsllq $63, %zmm3, %zmm1 -; KNL-NEXT: vptestmq %zmm1, %zmm1, %k1 +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] +; KNL-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; KNL-NEXT: vpsllq $63, %zmm2, %zmm0 +; KNL-NEXT: vptestmq %zmm0, %zmm0, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: kshiftlw $7, %k0, %k0 ; KNL-NEXT: korw %k0, %k1, %k1 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqw %zmm0, %xmm0 ; KNL-NEXT: retq ; @@ -1387,8 +1381,7 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: load_8i1: @@ -1405,8 +1398,7 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) { ; KNL-LABEL: load_16i1: ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: retq ; ; SKX-LABEL: load_16i1: @@ -1424,8 +1416,7 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 ; KNL-NEXT: retq ; @@ -1444,8 +1435,7 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: movzbl (%rdi), %eax ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: ## kill: %XMM0 %XMM0 %YMM0 ; KNL-NEXT: retq @@ -1465,10 +1455,9 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) { ; KNL: ## BB#0: ; KNL-NEXT: kmovw (%rdi), %k1 ; KNL-NEXT: kmovw 2(%rdi), %k2 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k2} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdw %zmm1, %ymm1 ; KNL-NEXT: retq ; @@ -1489,17 +1478,16 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) { ; KNL-NEXT: kmovw 2(%rdi), %k2 ; KNL-NEXT: kmovw 4(%rdi), %k3 ; KNL-NEXT: kmovw 6(%rdi), %k4 -; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k2} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm2 {%k3} {z} -; KNL-NEXT: vpmovdb %zmm2, %xmm2 -; KNL-NEXT: vmovdqa32 %zmm1, %zmm1 {%k4} {z} +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} ; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1 +; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k3} {z} +; KNL-NEXT: vpmovdb %zmm1, %xmm1 +; KNL-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k4} {z} +; KNL-NEXT: vpmovdb %zmm2, %xmm2 +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; KNL-NEXT: retq ; ; SKX-LABEL: load_64i1: diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll index 840239b9011..1991ee4f337 100644 --- a/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/test/CodeGen/X86/avx512-vbroadcast.ll @@ -218,8 +218,7 @@ define <16 x i32> @test_vbroadcast() { ; ALL: # BB#0: # %entry ; ALL-NEXT: vpxord %zmm0, %zmm0, %zmm0 ; ALL-NEXT: vcmpunordps %zmm0, %zmm0, %k1 -; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; ALL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: knotw %k1, %k1 ; ALL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} ; ALL-NEXT: retq diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll index 5a71053ebca..391d2a38853 100644 --- a/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/test/CodeGen/X86/avx512-vec-cmp.ll @@ -880,8 +880,7 @@ define <8 x i32>@test28(<8 x i64> %x, <8 x i64> %y, <8 x i64> %x1, <8 x i64> %y1 ; KNL-NEXT: vpcmpgtq %zmm1, %zmm0, %k0 ; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 ; KNL-NEXT: kxnorw %k1, %k0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovqd %zmm0, %ymm0 ; KNL-NEXT: retq ; @@ -905,8 +904,7 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32> ; KNL-NEXT: vpcmpgtd %zmm1, %zmm0, %k0 ; KNL-NEXT: vpcmpgtd %zmm3, %zmm2, %k1 ; KNL-NEXT: kxorw %k1, %k0, %k1 -; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; KNL-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: retq ; diff --git a/test/CodeGen/X86/sse-fsignum.ll b/test/CodeGen/X86/sse-fsignum.ll index 7159d4c8717..32594a27698 100644 --- a/test/CodeGen/X86/sse-fsignum.ll +++ b/test/CodeGen/X86/sse-fsignum.ll @@ -93,15 +93,14 @@ define void @signum32b(<8 x float>*) { ; AVX512F-NEXT: vmovaps (%rdi), %ymm0 ; AVX512F-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; AVX512F-NEXT: vcmpltps %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} {z} -; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 -; AVX512F-NEXT: vcvtdq2ps %ymm3, %ymm3 +; AVX512F-NEXT: vpternlogq $255, %zmm2, %zmm2, %zmm2 {%k1} {z} +; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512F-NEXT: vcvtdq2ps %ymm2, %ymm2 ; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512F-NEXT: vsubps %ymm0, %ymm3, %ymm0 +; AVX512F-NEXT: vsubps %ymm0, %ymm2, %ymm0 ; AVX512F-NEXT: vmovaps %ymm0, (%rdi) ; AVX512F-NEXT: retq entry: diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll index 42e47f3ef12..c34f333ef78 100644 --- a/test/CodeGen/X86/vector-compare-results.ll +++ b/test/CodeGen/X86/vector-compare-results.ll @@ -608,8 +608,7 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v8f64: ; AVX512F: # BB#0: ; AVX512F-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -623,8 +622,7 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind { ; AVX512BW-LABEL: test_cmp_v8f64: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: retq %1 = fcmp ogt <8 x double> %a0, %a1 @@ -670,8 +668,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v16f32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -685,8 +682,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind { ; AVX512BW-LABEL: test_cmp_v16f32: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: retq %1 = fcmp ogt <16 x float> %a0, %a1 @@ -783,8 +779,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v8i64: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -798,8 +793,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; AVX512BW-LABEL: test_cmp_v8i64: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpcmpgtq %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: retq %1 = icmp sgt <8 x i64> %a0, %a1 @@ -848,8 +842,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX512F-LABEL: test_cmp_v16i32: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -863,8 +856,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind { ; AVX512BW-LABEL: test_cmp_v16i32: ; AVX512BW: # BB#0: ; AVX512BW-NEXT: vpcmpgtd %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512BW-NEXT: retq %1 = icmp sgt <16 x i32> %a0, %a1 diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index 39fbc7611de..774d615ae89 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -1244,8 +1244,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512F-NEXT: retq ; @@ -1253,8 +1252,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512BW-NEXT: retq ; @@ -1435,8 +1433,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512F-NEXT: retq @@ -1445,8 +1442,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: %XMM0 %XMM0 %YMM0 ; AVX512BW-NEXT: retq @@ -1642,8 +1638,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512F-NEXT: retq ; @@ -1651,8 +1646,7 @@ define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512BW-NEXT: retq ; @@ -1945,8 +1939,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -1954,8 +1947,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: retq ; @@ -2348,8 +2340,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512F: # BB#0: # %entry ; AVX512F-NEXT: movzbl (%rdi), %eax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: retq ; @@ -2357,8 +2348,7 @@ define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { ; AVX512BW: # BB#0: # %entry ; AVX512BW-NEXT: movzbl (%rdi), %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: retq ; @@ -2860,8 +2850,7 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { ; AVX512-LABEL: load_sext_16i1_to_16i8: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: retq ; @@ -3398,8 +3387,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { ; AVX512-LABEL: load_sext_16i1_to_16i16: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512-NEXT: retq ; @@ -4244,12 +4232,11 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { ; AVX512: # BB#0: # %entry ; AVX512-NEXT: kmovw (%rdi), %k1 ; AVX512-NEXT: kmovw 2(%rdi), %k2 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} {z} -; AVX512-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512-NEXT: vmovdqa32 %zmm0, %zmm0 {%k2} {z} +; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: retq ; ; X32-SSE41-LABEL: load_sext_32i1_to_32i8: diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll index 3ad92737a2e..4312b67546d 100644 --- a/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/test/CodeGen/X86/vector-shuffle-v1.ll @@ -71,13 +71,12 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> % ; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,6,1,0,3,7,7,0] -; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -101,14 +100,13 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1 ; AVX512F: # BB#0: ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vpcmpeqd %zmm3, %zmm1, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k2} {z} -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} {z} -; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] -; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 -; AVX512F-NEXT: vpslld $31, %zmm3, %zmm1 -; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] +; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpslld $31, %zmm2, %zmm0 +; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -157,13 +155,12 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vextracti32x4 $1, %zmm1, %xmm1 -; AVX512F-NEXT: vpbroadcastq %xmm1, %zmm1 -; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 -; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq %xmm0, %zmm0 +; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: retq ; @@ -185,8 +182,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -215,8 +211,7 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) { ; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1] ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 @@ -241,8 +236,7 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 @@ -271,8 +265,7 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) { ; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7] ; AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 @@ -301,13 +294,12 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) { ; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: movb $51, %al ; AVX512F-NEXT: kmovw %eax, %k2 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2} {z} -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} +; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1] -; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2 +; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -337,10 +329,10 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) { ; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512F-NEXT: vpsllq $63, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} {z} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [9,1,2,3,4,5,6,7] -; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7] +; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 +; AVX512F-NEXT: vpermt2q %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 ; AVX512F-NEXT: kmovw %k0, %eax @@ -367,8 +359,7 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { ; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: ; AVX512F: # BB#0: ; AVX512F-NEXT: kmovw %edi, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 ; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 @@ -403,9 +394,8 @@ define i64 @shuf64i1_zero(i64 %a) { ; AVX512F-NEXT: andq $-32, %rsp ; AVX512F-NEXT: subq $96, %rsp ; AVX512F-NEXT: movl %edi, {{[0-9]+}}(%rsp) -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 ; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1