From 1690198db2bd0e32593340e4f0c0ea7cea63c761 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 4 Oct 2019 18:02:46 +0000 Subject: [PATCH] [X86] Remove isel patterns for mask vpcmpgt/vpcmpeq. Switch vpcmp to these based on the immediate in MCInstLower The immediate form of VPCMP can represent these completely. The vpcmpgt/eq are just shorter encodings. This patch removes the isel patterns and just swaps the opcodes and removes the immediate in MCInstLower. This matches where we do some other encodings tricks. Removes over 10K bytes from the isel table. Differential Revision: https://reviews.llvm.org/D68446 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373766 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 178 ++++++----------------------- lib/Target/X86/X86MCInstLower.cpp | 175 ++++++++++++++++++++++++++++ test/CodeGen/X86/avx512-mask-op.ll | 40 +++---- 3 files changed, 227 insertions(+), 166 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 490d0c3048f..9b5de59430a 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2145,93 +2145,82 @@ let Predicates = [HasAVX512] in { SchedWriteFCmp.Scl>, AVX512XDIi8Base, VEX_W; } -multiclass avx512_icmp_packed opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86FoldableSchedWrite sched, +multiclass avx512_icmp_packed opc, string OpcodeStr, + X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> { - let isCommutable = IsCommutable in + let isCommutable = IsCommutable, hasSideEffects = 0 in def rr : AVX512BI, - EVEX_4V, Sched<[sched]>; + []>, EVEX_4V, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rm : AVX512BI, - EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; - let isCommutable = IsCommutable in + []>, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>; + let isCommutable = IsCommutable, hasSideEffects = 0 in def rrk : AVX512BI, - EVEX_4V, EVEX_K, Sched<[sched]>; + []>, EVEX_4V, EVEX_K, Sched<[sched]>; + let mayLoad = 1, hasSideEffects = 0 in def rmk : AVX512BI, - EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>; } -multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, +multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, X86FoldableSchedWrite sched, X86VectorVTInfo _, bit IsCommutable> : - avx512_icmp_packed { + avx512_icmp_packed { + let mayLoad = 1, hasSideEffects = 0 in { def rmb : AVX512BI, - EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + []>, EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; def rmbk : AVX512BI, - EVEX_4V, EVEX_K, EVEX_B, + []>, EVEX_4V, EVEX_K, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>; + } } -multiclass avx512_icmp_packed_vl opc, string OpcodeStr, PatFrag OpNode, - PatFrag OpNode_su, X86SchedWriteWidths sched, +multiclass avx512_icmp_packed_vl opc, string OpcodeStr, + X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed, EVEX_V256; - defm Z128 : avx512_icmp_packed, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, - PatFrag OpNode, PatFrag OpNode_su, X86SchedWriteWidths sched, AVX512VLVectorVTInfo VTInfo, Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb, EVEX_V128; } } @@ -2239,53 +2228,42 @@ multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr, // This fragment treats X86cmpm as commutable to help match loads in both // operands for PCMPEQ. def X86setcc_commute : SDNode<"ISD::SETCC", SDTSetCC, [SDNPCommutative]>; -def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2), - (X86setcc_commute node:$src1, node:$src2, SETEQ)>; def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2), (setcc node:$src1, node:$src2, SETGT)>; -def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpeqm_c node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; -def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2), - (X86pcmpgtm node:$src1, node:$src2), [{ - return N->hasOneUse(); -}]>; - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't // increase the pattern complexity the way an immediate would. let AddedComplexity = 2 in { // FIXME: Is there a better scheduler class for VPCMP? -defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; -defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su, +defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", SchedWriteVecALU, avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>, VEX_WIG; -defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", SchedWriteVecALU, avx512vl_i16_info, HasBWI>, EVEX_CD8<16, CD8VF>, VEX_WIG; -defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", SchedWriteVecALU, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su, +defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", SchedWriteVecALU, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; } @@ -3148,54 +3126,6 @@ multiclass avx512_mask_shiftop_w opc1, bits<8> opc2, string OpcodeStr, defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, WriteShuffle>; defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, WriteShuffle>; -// Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. -multiclass axv512_icmp_packed_no_vlx_lowering { - def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrr") - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; - - def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag_su (Narrow.VT Narrow.RC:$src1), - (Narrow.VT Narrow.RC:$src2)))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrrk") - (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src2, Narrow.SubRegIdx))), - Narrow.KRC)>; -} - -multiclass axv512_icmp_packed_rmb_no_vlx_lowering { - // Broadcast load. - def : Pat<(Narrow.KVT (Frag (Narrow.VT Narrow.RC:$src1), - (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrmb") - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - addr:$src2), - Narrow.KRC)>; - - def : Pat<(Narrow.KVT (and Narrow.KRC:$mask, - (Frag_su (Narrow.VT Narrow.RC:$src1), - (Narrow.BroadcastLdFrag addr:$src2)))), - (COPY_TO_REGCLASS - (!cast(InstStr#"Zrmbk") - (COPY_TO_REGCLASS Narrow.KRC:$mask, Wide.KRC), - (Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)), - addr:$src2), - Narrow.KRC)>; -} - // Patterns for comparing 128/256-bit integer vectors using 512-bit instruction. multiclass axv512_icmp_packed_cc_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - defm : axv512_icmp_packed_rmb_no_vlx_lowering; - } - defm : axv512_icmp_packed_cc_no_vlx_lowering; defm : axv512_icmp_packed_cc_no_vlx_lowering; @@ -3377,22 +3279,6 @@ let Predicates = [HasAVX512, NoVLX] in { } let Predicates = [HasBWI, NoVLX] in { - // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't - // increase the pattern complexity the way an immediate would. - let AddedComplexity = 2 in { - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - - defm : axv512_icmp_packed_no_vlx_lowering; - defm : axv512_icmp_packed_no_vlx_lowering; - } - defm : axv512_icmp_packed_cc_no_vlx_lowering; defm : axv512_icmp_packed_cc_no_vlx_lowering; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 0726b91c196..78098fd6262 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -535,6 +535,181 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { break; } + case X86::VPCMPBZ128rmi: case X86::VPCMPBZ128rmik: + case X86::VPCMPBZ128rri: case X86::VPCMPBZ128rrik: + case X86::VPCMPBZ256rmi: case X86::VPCMPBZ256rmik: + case X86::VPCMPBZ256rri: case X86::VPCMPBZ256rrik: + case X86::VPCMPBZrmi: case X86::VPCMPBZrmik: + case X86::VPCMPBZrri: case X86::VPCMPBZrrik: + case X86::VPCMPDZ128rmi: case X86::VPCMPDZ128rmik: + case X86::VPCMPDZ128rmib: case X86::VPCMPDZ128rmibk: + case X86::VPCMPDZ128rri: case X86::VPCMPDZ128rrik: + case X86::VPCMPDZ256rmi: case X86::VPCMPDZ256rmik: + case X86::VPCMPDZ256rmib: case X86::VPCMPDZ256rmibk: + case X86::VPCMPDZ256rri: case X86::VPCMPDZ256rrik: + case X86::VPCMPDZrmi: case X86::VPCMPDZrmik: + case X86::VPCMPDZrmib: case X86::VPCMPDZrmibk: + case X86::VPCMPDZrri: case X86::VPCMPDZrrik: + case X86::VPCMPQZ128rmi: case X86::VPCMPQZ128rmik: + case X86::VPCMPQZ128rmib: case X86::VPCMPQZ128rmibk: + case X86::VPCMPQZ128rri: case X86::VPCMPQZ128rrik: + case X86::VPCMPQZ256rmi: case X86::VPCMPQZ256rmik: + case X86::VPCMPQZ256rmib: case X86::VPCMPQZ256rmibk: + case X86::VPCMPQZ256rri: case X86::VPCMPQZ256rrik: + case X86::VPCMPQZrmi: case X86::VPCMPQZrmik: + case X86::VPCMPQZrmib: case X86::VPCMPQZrmibk: + case X86::VPCMPQZrri: case X86::VPCMPQZrrik: + case X86::VPCMPWZ128rmi: case X86::VPCMPWZ128rmik: + case X86::VPCMPWZ128rri: case X86::VPCMPWZ128rrik: + case X86::VPCMPWZ256rmi: case X86::VPCMPWZ256rmik: + case X86::VPCMPWZ256rri: case X86::VPCMPWZ256rrik: + case X86::VPCMPWZrmi: case X86::VPCMPWZrmik: + case X86::VPCMPWZrri: case X86::VPCMPWZrrik: { + // Turn immediate 0 into the VPCMPEQ instruction. + if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPEQBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPEQBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPEQBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPEQBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPEQBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPEQBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPEQBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPEQBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPEQBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPEQDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPEQDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPEQDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPEQDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPEQDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPEQDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPEQDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPEQDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPEQDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPEQDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPEQDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPEQDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPEQDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPEQDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPEQDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPEQDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPEQDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPEQDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPEQQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPEQQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPEQQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPEQQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPEQQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPEQQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPEQQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPEQQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPEQQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPEQQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPEQQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPEQQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPEQQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPEQQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPEQQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPEQQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPEQQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPEQQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPEQWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPEQWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPEQWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPEQWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPEQWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPEQWZ256rmk; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPEQWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPEQWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPEQWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPEQWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPEQWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPEQWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + + // Turn immediate 6 into the VPCMPGT instruction. + if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) { + unsigned NewOpc; + switch (OutMI.getOpcode()) { + case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break; + case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break; + case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break; + case X86::VPCMPBZ128rrik: NewOpc = X86::VPCMPGTBZ128rrk; break; + case X86::VPCMPBZ256rmi: NewOpc = X86::VPCMPGTBZ256rm; break; + case X86::VPCMPBZ256rmik: NewOpc = X86::VPCMPGTBZ256rmk; break; + case X86::VPCMPBZ256rri: NewOpc = X86::VPCMPGTBZ256rr; break; + case X86::VPCMPBZ256rrik: NewOpc = X86::VPCMPGTBZ256rrk; break; + case X86::VPCMPBZrmi: NewOpc = X86::VPCMPGTBZrm; break; + case X86::VPCMPBZrmik: NewOpc = X86::VPCMPGTBZrmk; break; + case X86::VPCMPBZrri: NewOpc = X86::VPCMPGTBZrr; break; + case X86::VPCMPBZrrik: NewOpc = X86::VPCMPGTBZrrk; break; + case X86::VPCMPDZ128rmi: NewOpc = X86::VPCMPGTDZ128rm; break; + case X86::VPCMPDZ128rmib: NewOpc = X86::VPCMPGTDZ128rmb; break; + case X86::VPCMPDZ128rmibk: NewOpc = X86::VPCMPGTDZ128rmbk; break; + case X86::VPCMPDZ128rmik: NewOpc = X86::VPCMPGTDZ128rmk; break; + case X86::VPCMPDZ128rri: NewOpc = X86::VPCMPGTDZ128rr; break; + case X86::VPCMPDZ128rrik: NewOpc = X86::VPCMPGTDZ128rrk; break; + case X86::VPCMPDZ256rmi: NewOpc = X86::VPCMPGTDZ256rm; break; + case X86::VPCMPDZ256rmib: NewOpc = X86::VPCMPGTDZ256rmb; break; + case X86::VPCMPDZ256rmibk: NewOpc = X86::VPCMPGTDZ256rmbk; break; + case X86::VPCMPDZ256rmik: NewOpc = X86::VPCMPGTDZ256rmk; break; + case X86::VPCMPDZ256rri: NewOpc = X86::VPCMPGTDZ256rr; break; + case X86::VPCMPDZ256rrik: NewOpc = X86::VPCMPGTDZ256rrk; break; + case X86::VPCMPDZrmi: NewOpc = X86::VPCMPGTDZrm; break; + case X86::VPCMPDZrmib: NewOpc = X86::VPCMPGTDZrmb; break; + case X86::VPCMPDZrmibk: NewOpc = X86::VPCMPGTDZrmbk; break; + case X86::VPCMPDZrmik: NewOpc = X86::VPCMPGTDZrmk; break; + case X86::VPCMPDZrri: NewOpc = X86::VPCMPGTDZrr; break; + case X86::VPCMPDZrrik: NewOpc = X86::VPCMPGTDZrrk; break; + case X86::VPCMPQZ128rmi: NewOpc = X86::VPCMPGTQZ128rm; break; + case X86::VPCMPQZ128rmib: NewOpc = X86::VPCMPGTQZ128rmb; break; + case X86::VPCMPQZ128rmibk: NewOpc = X86::VPCMPGTQZ128rmbk; break; + case X86::VPCMPQZ128rmik: NewOpc = X86::VPCMPGTQZ128rmk; break; + case X86::VPCMPQZ128rri: NewOpc = X86::VPCMPGTQZ128rr; break; + case X86::VPCMPQZ128rrik: NewOpc = X86::VPCMPGTQZ128rrk; break; + case X86::VPCMPQZ256rmi: NewOpc = X86::VPCMPGTQZ256rm; break; + case X86::VPCMPQZ256rmib: NewOpc = X86::VPCMPGTQZ256rmb; break; + case X86::VPCMPQZ256rmibk: NewOpc = X86::VPCMPGTQZ256rmbk; break; + case X86::VPCMPQZ256rmik: NewOpc = X86::VPCMPGTQZ256rmk; break; + case X86::VPCMPQZ256rri: NewOpc = X86::VPCMPGTQZ256rr; break; + case X86::VPCMPQZ256rrik: NewOpc = X86::VPCMPGTQZ256rrk; break; + case X86::VPCMPQZrmi: NewOpc = X86::VPCMPGTQZrm; break; + case X86::VPCMPQZrmib: NewOpc = X86::VPCMPGTQZrmb; break; + case X86::VPCMPQZrmibk: NewOpc = X86::VPCMPGTQZrmbk; break; + case X86::VPCMPQZrmik: NewOpc = X86::VPCMPGTQZrmk; break; + case X86::VPCMPQZrri: NewOpc = X86::VPCMPGTQZrr; break; + case X86::VPCMPQZrrik: NewOpc = X86::VPCMPGTQZrrk; break; + case X86::VPCMPWZ128rmi: NewOpc = X86::VPCMPGTWZ128rm; break; + case X86::VPCMPWZ128rmik: NewOpc = X86::VPCMPGTWZ128rmk; break; + case X86::VPCMPWZ128rri: NewOpc = X86::VPCMPGTWZ128rr; break; + case X86::VPCMPWZ128rrik: NewOpc = X86::VPCMPGTWZ128rrk; break; + case X86::VPCMPWZ256rmi: NewOpc = X86::VPCMPGTWZ256rm; break; + case X86::VPCMPWZ256rmik: NewOpc = X86::VPCMPGTWZ256rmk; break; + case X86::VPCMPWZ256rri: NewOpc = X86::VPCMPGTWZ256rr; break; + case X86::VPCMPWZ256rrik: NewOpc = X86::VPCMPGTWZ256rrk; break; + case X86::VPCMPWZrmi: NewOpc = X86::VPCMPGTWZrm; break; + case X86::VPCMPWZrmik: NewOpc = X86::VPCMPGTWZrmk; break; + case X86::VPCMPWZrri: NewOpc = X86::VPCMPGTWZrr; break; + case X86::VPCMPWZrrik: NewOpc = X86::VPCMPGTWZrrk; break; + } + + OutMI.setOpcode(NewOpc); + OutMI.erase(&OutMI.getOperand(OutMI.getNumOperands() - 1)); + break; + } + + break; + } + // CALL64r, CALL64pcrel32 - These instructions used to have // register inputs modeled as normal uses instead of implicit uses. As such, // they we used to truncate off all but the first operand (the callee). This diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index 19f9a2a2bd5..8c86d957d4c 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -484,8 +484,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; KNL-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2 ; KNL-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpcmpleq %zmm1, %zmm0, %k1 -; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 {%k1} +; KNL-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 +; KNL-NEXT: vpcmpleq %zmm1, %zmm0, %k1 {%k1} ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: vzeroupper @@ -493,8 +493,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; ; SKX-LABEL: test4: ; SKX: ## %bb.0: -; SKX-NEXT: vpcmpleq %ymm1, %ymm0, %k1 -; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} +; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 +; SKX-NEXT: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ; SKX-NEXT: vpmovm2d %k0, %xmm0 ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -505,8 +505,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; AVX512BW-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512BW-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512BW-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpcmpleq %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 {%k1} +; AVX512BW-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 +; AVX512BW-NEXT: vpcmpleq %zmm1, %zmm0, %k1 {%k1} ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -518,8 +518,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; AVX512DQ-NEXT: ## kill: def $ymm2 killed $ymm2 def $zmm2 ; AVX512DQ-NEXT: ## kill: def $ymm1 killed $ymm1 def $zmm1 ; AVX512DQ-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vpcmpleq %zmm1, %zmm0, %k1 -; AVX512DQ-NEXT: vpcmpgtq %zmm3, %zmm2, %k0 {%k1} +; AVX512DQ-NEXT: vpcmpgtq %zmm3, %zmm2, %k1 +; AVX512DQ-NEXT: vpcmpleq %zmm1, %zmm0, %k0 {%k1} ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -527,8 +527,8 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 ; ; X86-LABEL: test4: ; X86: ## %bb.0: -; X86-NEXT: vpcmpleq %ymm1, %ymm0, %k1 -; X86-NEXT: vpcmpgtq %ymm3, %ymm2, %k0 {%k1} +; X86-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 +; X86-NEXT: vpcmpleq %ymm1, %ymm0, %k0 {%k1} ; X86-NEXT: vpmovm2d %k0, %xmm0 ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -546,8 +546,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; KNL-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 ; KNL-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; KNL-NEXT: vpcmpleq %zmm3, %zmm2, %k1 -; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 {%k1} +; KNL-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 +; KNL-NEXT: vpcmpleq %zmm3, %zmm2, %k1 {%k1} ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; KNL-NEXT: vzeroupper @@ -555,8 +555,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; ; SKX-LABEL: test5: ; SKX: ## %bb.0: -; SKX-NEXT: vpcmpleq %xmm3, %xmm2, %k1 -; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 {%k1} +; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 +; SKX-NEXT: vpcmpleq %xmm3, %xmm2, %k0 {%k1} ; SKX-NEXT: vpmovm2q %k0, %xmm0 ; SKX-NEXT: retq ; @@ -566,8 +566,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; AVX512BW-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512BW-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpcmpleq %zmm3, %zmm2, %k1 -; AVX512BW-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 {%k1} +; AVX512BW-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 +; AVX512BW-NEXT: vpcmpleq %zmm3, %zmm2, %k1 {%k1} ; AVX512BW-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -579,8 +579,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; AVX512DQ-NEXT: ## kill: def $xmm2 killed $xmm2 def $zmm2 ; AVX512DQ-NEXT: ## kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vpcmpleq %zmm3, %zmm2, %k1 -; AVX512DQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k0 {%k1} +; AVX512DQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 +; AVX512DQ-NEXT: vpcmpleq %zmm3, %zmm2, %k0 {%k1} ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0 ; AVX512DQ-NEXT: ## kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512DQ-NEXT: vzeroupper @@ -588,8 +588,8 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1 ; ; X86-LABEL: test5: ; X86: ## %bb.0: -; X86-NEXT: vpcmpleq %xmm3, %xmm2, %k1 -; X86-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 {%k1} +; X86-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 +; X86-NEXT: vpcmpleq %xmm3, %xmm2, %k0 {%k1} ; X86-NEXT: vpmovm2q %k0, %xmm0 ; X86-NEXT: retl %x_gt_y = icmp slt <2 x i64> %x, %y -- 2.40.0