From 9e5a8cbba139e50aaa69641aa9956c1be8b84bca Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 14 Apr 2019 18:26:06 +0000
Subject: [PATCH] [X86] Don't form masked vpcmp/vcmp/vptestm operations if the
 setcc node has more than one use.

We're better off emitting a single compare + kand rather than a compare
for the other use and a masked compare.

I'm looking into using custom instruction selection for VPTESTM to
reduce the ridiculous number of permutations of patterns in the isel
table. Putting a one-use check on all masked compare folding makes load
fold matching in the custom code easier.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@358358 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td              | 404 +++++++++++-------
 .../X86/avx512vl-intrinsics-upgrade.ll        |  60 +--
 test/CodeGen/X86/vec_uaddo.ll                 |  13 +-
 test/CodeGen/X86/vector-compare-all_of.ll     |   6 +-
 4 files changed, 295 insertions(+), 188 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 6e6c8f10c09..4403f986b23 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -388,11 +388,11 @@ multiclass AVX512_maskable_common_cmp O, Format F, X86VectorVTInfo _,
 multiclass AVX512_maskable_cmp O, Format F, X86VectorVTInfo _,
                                dag Outs, dag Ins, string OpcodeStr,
                                string AttSrcAsm, string IntelSrcAsm,
-                               dag RHS, bit IsCommutable = 0> :
+                               dag RHS, dag RHS_su, bit IsCommutable = 0> :
   AVX512_maskable_common_cmp;
+                             (and _.KRCWM:$mask, RHS_su), IsCommutable>;

 // Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
@@ -2020,15 +2020,16 @@ defm VPBLENDMW : blendmask_bw<0x66, "vpblendmw", SchedWriteVarBlend,

 // avx512_cmp_scalar - AVX512 CMPSS and CMPSD
 multiclass avx512_cmp_scalar {
  defm  rr_Int  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
                  (outs _.KRC:$dst),
                  (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
                  "vcmp"#_.Suffix,
                  "$cc, $src2, $src1", "$src1, $src2, $cc",
-                 (OpNode (_.VT _.RC:$src1),
-                         (_.VT _.RC:$src2),
-                         imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
+                 (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+                 (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                            imm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
  let mayLoad = 1 in
  defm  rm_Int  : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
                  (outs _.KRC:$dst),
@@ -2036,6 +2037,8 @@ multiclass avx512_cmp_scalar,
                  EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
                  Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -2044,9 +2047,10 @@ multiclass avx512_cmp_scalar,
+                 (OpNodeSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                            imm:$cc),
+                 (OpNodeSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                               imm:$cc)>,
                  EVEX_4V, VEX_LIG, EVEX_B, Sched<[sched]>;

  let isCodeGenOnly = 1 in {
@@ -2072,18 +2076,29 @@ multiclass avx512_cmp_scalarhasOneUse();
+}]>;
+def X86cmpmsSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                             (X86cmpmsSAE node:$src1, node:$src2, node:$cc), [{
+  return N->hasOneUse();
+}]>;
+
 let Predicates = [HasAVX512] in {
  let ExeDomain = SSEPackedSingle in
  defm VCMPSSZ : avx512_cmp_scalar,
                 AVX512XSIi8Base;
  let ExeDomain = SSEPackedDouble in
  defm VCMPSDZ : avx512_cmp_scalar,
                 AVX512XDIi8Base, VEX_W;
 }

 multiclass avx512_icmp_packed opc, string OpcodeStr, PatFrag OpNode,
-                              X86FoldableSchedWrite sched, X86VectorVTInfo _,
-                              bit IsCommutable> {
+                              PatFrag OpNode_su, X86FoldableSchedWrite sched,
+                              X86VectorVTInfo _, bit IsCommutable> {
  let isCommutable = IsCommutable in
  def rr : AVX512BI opc, string OpcodeStr, PatFrag OpNode,
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
                        "$dst {${mask}}, $src1, $src2}"),
             [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                             (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
+                             (OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2))))]>,
             EVEX_4V, EVEX_K, Sched<[sched]>;
  def rmk : AVX512BI,
            EVEX_4V, EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
 }

 multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, PatFrag OpNode,
+                                  PatFrag OpNode_su,
                                   X86FoldableSchedWrite sched, X86VectorVTInfo _,
                                   bit IsCommutable> :
-  avx512_icmp_packed {
+  avx512_icmp_packed {
  def rmb : AVX512BI opc, string OpcodeStr, PatFrag OpNode,
            "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
            "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
            [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                             (OpNode (_.VT _.RC:$src1),
+                             (OpNode_su (_.VT _.RC:$src1),
                               (X86VBroadcast
                                (_.ScalarLdFrag addr:$src2)))))]>,
            EVEX_4V, EVEX_K, EVEX_B,
@@ -2140,33 +2156,34 @@ multiclass avx512_icmp_packed_rmb opc, string OpcodeStr, PatFrag OpNode,
 }

 multiclass avx512_icmp_packed_vl opc, string OpcodeStr, PatFrag OpNode,
-                                 X86SchedWriteWidths sched,
+                                 PatFrag OpNode_su, X86SchedWriteWidths sched,
                                  AVX512VLVectorVTInfo VTInfo, Predicate prd,
                                  bit IsCommutable = 0> {
  let Predicates = [prd] in
-  defm Z : avx512_icmp_packed, EVEX_V512;

  let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_packed, EVEX_V256;
-    defm Z128 : avx512_icmp_packed, EVEX_V128;
  }
 }

 multiclass avx512_icmp_packed_rmb_vl opc, string OpcodeStr,
-                                     PatFrag OpNode, X86SchedWriteWidths sched,
+                                     PatFrag OpNode, PatFrag OpNode_su,
+                                     X86SchedWriteWidths sched,
                                      AVX512VLVectorVTInfo VTInfo, Predicate prd,
                                      bit IsCommutable = 0> {
  let Predicates = [prd] in
-  defm Z : avx512_icmp_packed_rmb, EVEX_V512;

  let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_packed_rmb, EVEX_V256;
-    defm Z128 : avx512_icmp_packed_rmb, EVEX_V128;
  }
 }

@@ -2179,45 +2196,55 @@ def X86pcmpeqm_c : PatFrag<(ops node:$src1, node:$src2),
 def X86pcmpgtm : PatFrag<(ops node:$src1, node:$src2),
                          (setcc node:$src1, node:$src2, SETGT)>;

+def X86pcmpeqm_c_su : PatFrag<(ops node:$src1, node:$src2),
+                              (X86pcmpeqm_c node:$src1, node:$src2), [{
+  return N->hasOneUse();
+}]>;
+def X86pcmpgtm_su : PatFrag<(ops node:$src1, node:$src2),
+                            (X86pcmpgtm node:$src1, node:$src2), [{
+  return N->hasOneUse();
+}]>;
+
 // AddedComplexity is needed because the explicit SETEQ/SETGT CondCode doesn't
 // increase the pattern complexity the way an immediate would.
 let AddedComplexity = 2 in {
 // FIXME: Is there a better scheduler class for VPCMP?
-defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c,
+defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm_c, X86pcmpeqm_c_su,
                                       SchedWriteVecALU, avx512vl_i8_info, HasBWI, 1>,
                 EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c,
+defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm_c, X86pcmpeqm_c_su,
                                       SchedWriteVecALU, avx512vl_i16_info, HasBWI, 1>,
                 EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c,
+defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm_c, X86pcmpeqm_c_su,
                                           SchedWriteVecALU, avx512vl_i32_info, HasAVX512, 1>,
                 EVEX_CD8<32, CD8VF>;
-defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c,
+defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm_c, X86pcmpeqm_c_su,
                                           SchedWriteVecALU, avx512vl_i64_info, HasAVX512, 1>,
                 T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
+defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, X86pcmpgtm_su,
                                       SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
                 EVEX_CD8<8, CD8VF>, VEX_WIG;
-defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
+defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, X86pcmpgtm_su,
                                       SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
                 EVEX_CD8<16, CD8VF>, VEX_WIG;
-defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
+defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, X86pcmpgtm_su,
                                           SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
                 EVEX_CD8<32, CD8VF>;
-defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
+defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, X86pcmpgtm_su,
                                           SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
                 T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
 }

 multiclass avx512_icmp_cc opc, string Suffix, PatFrag Frag,
-                          PatFrag CommFrag, X86FoldableSchedWrite sched,
+                          PatFrag Frag_su, PatFrag CommFrag, PatFrag CommFrag_su,
+                          X86FoldableSchedWrite sched,
                           X86VectorVTInfo _, string Name> {
  let isCommutable = 1 in
  def rri : AVX512AIi8 opc, string Suffix, PatFrag Frag,
            "\t{$cc, $src2, $src1, $dst {${mask}}|",
            "$dst {${mask}}, $src1, $src2, $cc}"),
            [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                             (_.KVT (Frag:$cc (_.VT _.RC:$src1),
-                                              (_.VT _.RC:$src2),
-                                              cond))))]>,
+                             (_.KVT (Frag_su:$cc (_.VT _.RC:$src1),
+                                                 (_.VT _.RC:$src2),
+                                                 cond))))]>,
            EVEX_4V, EVEX_K, Sched<[sched]>;
  def rmik : AVX512AIi8 opc, string Suffix, PatFrag Frag,
            "$dst {${mask}}, $src1, $src2, $cc}"),
            [(set _.KRC:$dst, (and _.KRCWM:$mask,
                              (_.KVT
-                              (Frag:$cc
+                              (Frag_su:$cc
                                (_.VT _.RC:$src1),
                                (_.VT (_.LdFrag addr:$src2)),
                                cond))))]>,
@@ -2270,7 +2297,7 @@ multiclass avx512_icmp_cc opc, string Suffix, PatFrag Frag,
                               _.RC:$src1, addr:$src2,
                               (CommFrag.OperandTransform $cc))>;
  def : Pat<(and _.KRCWM:$mask,
-                (_.KVT (CommFrag:$cc (_.LdFrag addr:$src2),
+                (_.KVT (CommFrag_su:$cc (_.LdFrag addr:$src2),
                                      (_.VT _.RC:$src1), cond))),
            (!cast(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
                                          _.RC:$src1, addr:$src2,
@@ -2278,9 +2305,11 @@ multiclass avx512_icmp_cc opc, string Suffix, PatFrag Frag,
 }

 multiclass avx512_icmp_cc_rmb opc, string Suffix, PatFrag Frag,
-                              PatFrag CommFrag, X86FoldableSchedWrite sched,
+                              PatFrag Frag_su, PatFrag CommFrag,
+                              PatFrag CommFrag_su, X86FoldableSchedWrite sched,
                               X86VectorVTInfo _, string Name> :
-  avx512_icmp_cc {
+  avx512_icmp_cc {
  def rmib : AVX512AIi8 opc, string Suffix, PatFrag Frag,
            "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
            "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
            [(set _.KRC:$dst, (and _.KRCWM:$mask,
-                             (_.KVT (Frag:$cc
+                             (_.KVT (Frag_su:$cc
                                      (_.VT _.RC:$src1),
                                      (X86VBroadcast
                                       (_.ScalarLdFrag addr:$src2)),
@@ -2313,7 +2342,7 @@ multiclass avx512_icmp_cc_rmb opc, string Suffix, PatFrag Frag,
                                _.RC:$src1, addr:$src2,
                                (CommFrag.OperandTransform $cc))>;
  def : Pat<(and _.KRCWM:$mask,
-                (_.KVT (CommFrag:$cc (X86VBroadcast
+                (_.KVT (CommFrag_su:$cc (X86VBroadcast
                                       (_.ScalarLdFrag addr:$src2)),
                                      (_.VT _.RC:$src1), cond))),
            (!cast(Name#_.ZSuffix#"rmibk")
@@ -2322,32 +2351,34 @@ multiclass avx512_icmp_cc_rmb opc, string Suffix, PatFrag Frag,
 }

 multiclass avx512_icmp_cc_vl opc, string Suffix, PatFrag Frag,
-                             PatFrag CommFrag, X86SchedWriteWidths sched,
+                             PatFrag Frag_su, PatFrag CommFrag,
+                             PatFrag CommFrag_su, X86SchedWriteWidths sched,
                              AVX512VLVectorVTInfo VTInfo, Predicate prd> {
  let Predicates = [prd] in
-  defm Z : avx512_icmp_cc, EVEX_V512;
+  defm Z : avx512_icmp_cc, EVEX_V512;

  let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc, EVEX_V256;
-    defm Z128 : avx512_icmp_cc, EVEX_V128;
+    defm Z256 : avx512_icmp_cc, EVEX_V256;
+    defm Z128 : avx512_icmp_cc, EVEX_V128;
  }
 }

 multiclass avx512_icmp_cc_rmb_vl opc, string Suffix, PatFrag Frag,
-                                 PatFrag CommFrag, X86SchedWriteWidths sched,
+                                 PatFrag Frag_su, PatFrag CommFrag,
+                                 PatFrag CommFrag_su, X86SchedWriteWidths sched,
                                  AVX512VLVectorVTInfo VTInfo, Predicate prd> {
  let Predicates = [prd] in
-  defm Z : avx512_icmp_cc_rmb, EVEX_V512;
+  defm Z : avx512_icmp_cc_rmb, EVEX_V512;

  let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_icmp_cc_rmb, EVEX_V256;
-    defm Z128 : avx512_icmp_cc_rmb, EVEX_V128;
+    defm Z256 : avx512_icmp_cc_rmb, EVEX_V256;
+    defm Z128 : avx512_icmp_cc_rmb, EVEX_V128;
  }
 }

@@ -2371,6 +2402,12 @@ def X86pcmpm : PatFrag<(ops node:$src1, node:$src2, node:$cc),
   return !ISD::isUnsignedIntSetCC(CC);
 }], X86pcmpm_imm>;

+def X86pcmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                          (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast(N->getOperand(2))->get();
+  return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
 // Same as above, but commutes immediate. Use for load folding.
 def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
                                (setcc node:$src1, node:$src2, node:$cc), [{
@@ -2378,12 +2415,24 @@ def X86pcmpm_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
   return !ISD::isUnsignedIntSetCC(CC);
 }], X86pcmpm_imm_commute>;

+def X86pcmpm_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                                  (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast(N->getOperand(2))->get();
+  return N->hasOneUse() && !ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
 def X86pcmpum : PatFrag<(ops node:$src1, node:$src2, node:$cc),
                         (setcc node:$src1, node:$src2, node:$cc), [{
   ISD::CondCode CC = cast(N->getOperand(2))->get();
   return ISD::isUnsignedIntSetCC(CC);
 }], X86pcmpm_imm>;

+def X86pcmpum_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                           (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast(N->getOperand(2))->get();
+  return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm>;
+
 // Same as above, but commutes immediate. Use for load folding.
 def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
                                 (setcc node:$src1, node:$src2, node:$cc), [{
@@ -2391,53 +2440,76 @@ def X86pcmpum_commute : PatFrag<(ops node:$src1, node:$src2, node:$cc),
   return ISD::isUnsignedIntSetCC(CC);
 }], X86pcmpm_imm_commute>;

+def X86pcmpum_commute_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                                   (setcc node:$src1, node:$src2, node:$cc), [{
+  ISD::CondCode CC = cast(N->getOperand(2))->get();
+  return N->hasOneUse() && ISD::isUnsignedIntSetCC(CC);
+}], X86pcmpm_imm_commute>;
+
 // FIXME: Is there a better scheduler class for VPCMP/VPCMPU?
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_commute,
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86pcmpm, X86pcmpm_su,
+                                X86pcmpm_commute, X86pcmpm_commute_su,
                                 SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
               EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86pcmpum, X86pcmpum_su,
+                                 X86pcmpum_commute, X86pcmpum_commute_su,
                                  SchedWriteVecALU, avx512vl_i8_info, HasBWI>,
               EVEX_CD8<8, CD8VF>;
-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_commute,
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86pcmpm, X86pcmpm_su,
+                                X86pcmpm_commute, X86pcmpm_commute_su,
                                 SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
              VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86pcmpum, X86pcmpum_su,
+                                 X86pcmpum_commute, X86pcmpum_commute_su,
                                  SchedWriteVecALU, avx512vl_i16_info, HasBWI>,
              VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_commute,
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86pcmpm, X86pcmpm_su,
+                                    X86pcmpm_commute, X86pcmpm_commute_su,
                                     SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
              EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86pcmpum, X86pcmpum_su,
+                                     X86pcmpum_commute, X86pcmpum_commute_su,
                                      SchedWriteVecALU, avx512vl_i32_info, HasAVX512>,
              EVEX_CD8<32, CD8VF>;
-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_commute,
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86pcmpm, X86pcmpm_su,
+                                    X86pcmpm_commute, X86pcmpm_commute_su,
                                     SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
              VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_commute,
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86pcmpum, X86pcmpum_su,
+                                     X86pcmpum_commute, X86pcmpum_commute_su,
                                      SchedWriteVecALU, avx512vl_i64_info, HasAVX512>,
              VEX_W, EVEX_CD8<64, CD8VF>;

+def X86cmpm_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                         (X86cmpm node:$src1, node:$src2, node:$cc), [{
+  return N->hasOneUse();
+}]>;
+def X86cmpmSAE_su : PatFrag<(ops node:$src1, node:$src2, node:$cc),
+                            (X86cmpmSAE node:$src1, node:$src2, node:$cc), [{
+  return N->hasOneUse();
+}]>;
+
 multiclass avx512_vcmp_common {
  defm  rri  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
                 (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc),
                 "vcmp"#_.Suffix,
                 "$cc, $src2, $src1", "$src1, $src2, $cc",
-                (X86cmpm (_.VT _.RC:$src1),
-                         (_.VT _.RC:$src2),
-                         imm:$cc), 1>,
-                Sched<[sched]>;
+                (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+                (X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+                1>, Sched<[sched]>;

  defm  rmi  : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
                 (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
                 "vcmp"#_.Suffix,
                 "$cc, $src2, $src1", "$src1, $src2, $cc",
-                (X86cmpm (_.VT _.RC:$src1),
-                         (_.VT (_.LdFrag addr:$src2)),
-                         imm:$cc)>,
+                (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+                         imm:$cc),
+                (X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+                            imm:$cc)>,
                 Sched<[sched.Folded, sched.ReadAfterFold]>;

  defm  rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
@@ -2448,7 +2520,10 @@ multiclass avx512_vcmp_common,
+                            imm:$cc),
+                (X86cmpm_su (_.VT _.RC:$src1),
+                            (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
+                            imm:$cc)>,
                 EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;

  // Patterns for selecting with loads in other operand.
@@ -2457,9 +2532,9 @@ multiclass avx512_vcmp_common(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
                                                     imm:$cc)>;

-  def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2),
-                                         (_.VT _.RC:$src1),
-                                         CommutableCMPCC:$cc)),
+  def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (_.LdFrag addr:$src2),
+                                            (_.VT _.RC:$src1),
+                                            CommutableCMPCC:$cc)),
            (!cast(Name#_.ZSuffix#"rmik") _.KRCWM:$mask,
                                          _.RC:$src1, addr:$src2,
                                          imm:$cc)>;
@@ -2469,10 +2544,10 @@ multiclass avx512_vcmp_common(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
                                                      imm:$cc)>;

-  def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast
-                                          (_.ScalarLdFrag addr:$src2)),
-                                         (_.VT _.RC:$src1),
-                                         CommutableCMPCC:$cc)),
+  def : Pat<(and _.KRCWM:$mask, (X86cmpm_su (X86VBroadcast
+                                             (_.ScalarLdFrag addr:$src2)),
+                                            (_.VT _.RC:$src1),
+                                            CommutableCMPCC:$cc)),
            (!cast(Name#_.ZSuffix#"rmbik") _.KRCWM:$mask,
                                           _.RC:$src1, addr:$src2,
                                           imm:$cc)>;
@@ -2485,8 +2560,8 @@ multiclass avx512_vcmp_sae {
                 "vcmp"#_.Suffix,
                 "$cc, {sae}, $src2, $src1",
                 "$src1, $src2, {sae}, $cc",
-                (X86cmpmSAE (_.VT _.RC:$src1),
-                            (_.VT _.RC:$src2),
+                (X86cmpmSAE (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc),
+                (X86cmpmSAE_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
                                imm:$cc)>,
                 EVEX_B, Sched<[sched]>;
 }

@@ -5739,6 +5814,7 @@ defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef",
 //===----------------------------------------------------------------------===//

 multiclass avx512_vptest opc, string OpcodeStr, PatFrag OpNode,
+                         PatFrag OpNode_su,
                          X86FoldableSchedWrite sched, X86VectorVTInfo _,
                          string Name> {
  let ExeDomain = _.ExeDomain in {
@@ -5746,12 +5822,15 @@ multiclass avx512_vptest opc, string OpcodeStr, PatFrag OpNode,
  defm rr : AVX512_maskable_cmp,
+                   (OpNode (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV),
+                   (OpNode_su (and _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)>,
                    EVEX_4V, Sched<[sched]>;
  defm rm : AVX512_maskable_cmp,
                    EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -5762,13 +5841,14 @@ multiclass avx512_vptest opc, string OpcodeStr, PatFrag OpNode,
            (_.KVT (!cast(Name # _.ZSuffix # "rr")
                    _.RC:$src, _.RC:$src))>;

-  def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
+  def : Pat<(_.KVT (and _.KRC:$mask, (OpNode_su _.RC:$src, _.ImmAllZerosV))),
            (_.KVT (!cast(Name # _.ZSuffix # "rrk")
                    _.KRC:$mask, _.RC:$src, _.RC:$src))>;
 }

 multiclass avx512_vptest_mb opc, string OpcodeStr, PatFrag OpNode,
-                            X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+                            PatFrag OpNode_su, X86FoldableSchedWrite sched,
+                            X86VectorVTInfo _> {
  let ExeDomain = _.ExeDomain in
  defm rmb : AVX512_maskable_cmp opc, string OpcodeStr, PatFrag OpNode,
                    (OpNode (and _.RC:$src1,
                                 (X86VBroadcast
                                  (_.ScalarLdFrag addr:$src2))),
-                           _.ImmAllZerosV)>,
+                           _.ImmAllZerosV),
+                   (OpNode_su (and _.RC:$src1,
+                                   (X86VBroadcast
+                                    (_.ScalarLdFrag addr:$src2))),
+                              _.ImmAllZerosV)>,
                    EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
 }

 // Use 512bit version to implement 128/256 bit in case NoVLX.
-multiclass avx512_vptest_lowering {
+multiclass avx512_vptest_lowering {
  def : Pat<(_.KVT (OpNode (and _.RC:$src1, _.RC:$src2),
                           _.ImmAllZerosV)),
            (_.KVT (COPY_TO_REGCLASS
@@ -5796,8 +5881,8 @@ multiclass avx512_vptest_lowering;
  def : Pat<(_.KVT (and _.KRC:$mask,
-                       (OpNode (and _.RC:$src1, _.RC:$src2),
-                               _.ImmAllZerosV))),
+                       (OpNode_su (and _.RC:$src1, _.RC:$src2),
+                                  _.ImmAllZerosV))),
            (COPY_TO_REGCLASS
             (!cast(Name # "Zrrk")
              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
@@ -5816,7 +5901,7 @@ multiclass avx512_vptest_lowering;
-  def : Pat<(_.KVT (and _.KRC:$mask, (OpNode _.RC:$src, _.ImmAllZerosV))),
+  def : Pat<(_.KVT (and _.KRC:$mask, (OpNode_su _.RC:$src, _.ImmAllZerosV))),
            (COPY_TO_REGCLASS
             (!cast(Name # "Zrrk")
              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
@@ -5828,56 +5913,58 @@ multiclass avx512_vptest_lowering opc, string OpcodeStr, PatFrag OpNode,
-                              X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
+                              PatFrag OpNode_su, X86SchedWriteWidths sched,
+                              AVX512VLVectorVTInfo _> {
  let Predicates = [HasAVX512] in
-  defm Z : avx512_vptest,
-           avx512_vptest_mb, EVEX_V512;
+  defm Z : avx512_vptest,
+           avx512_vptest_mb, EVEX_V512;

  let Predicates = [HasAVX512, HasVLX] in {
-    defm Z256 : avx512_vptest,
-                avx512_vptest_mb, EVEX_V256;
-    defm Z128 : avx512_vptest,
-                avx512_vptest_mb, EVEX_V128;
+    defm Z256 : avx512_vptest,
+                avx512_vptest_mb, EVEX_V256;
+    defm Z128 : avx512_vptest,
+                avx512_vptest_mb, EVEX_V128;
  }
  let Predicates = [HasAVX512, NoVLX] in {
-    defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, NAME>;
-    defm Z128_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info128, NAME>;
+    defm Z256_Alt : avx512_vptest_lowering< OpNode, OpNode_su, _.info512, _.info256, NAME>;
+    defm Z128_Alt : avx512_vptest_lowering< OpNode, OpNode_su, _.info512, _.info128, NAME>;
  }
 }

 multiclass avx512_vptest_dq opc, string OpcodeStr, PatFrag OpNode,
-                            X86SchedWriteWidths sched> {
-  defm D : avx512_vptest_dq_sizes {
+  defm D : avx512_vptest_dq_sizes;
-  defm Q : avx512_vptest_dq_sizes, VEX_W;
 }

 multiclass avx512_vptest_wb opc, string OpcodeStr,
-                            PatFrag OpNode, X86SchedWriteWidths sched> {
+                            PatFrag OpNode, PatFrag OpNode_su,
+                            X86SchedWriteWidths sched> {
  let Predicates = [HasBWI] in {
-    defm WZ: avx512_vptest, EVEX_V512, VEX_W;
-    defm BZ: avx512_vptest, EVEX_V512;
  }
  let Predicates = [HasVLX, HasBWI] in {
-    defm WZ256: avx512_vptest, EVEX_V256, VEX_W;
-    defm WZ128: avx512_vptest, EVEX_V128, VEX_W;
-    defm BZ256: avx512_vptest, EVEX_V256;
-    defm BZ128: avx512_vptest, EVEX_V128;
  }

  let Predicates = [HasBWI, NoVLX] in {
-    defm BZ256_Alt : avx512_vptest_lowering;
-    defm BZ128_Alt : avx512_vptest_lowering;
-    defm WZ256_Alt : avx512_vptest_lowering;
-    defm WZ128_Alt : avx512_vptest_lowering;
+    defm BZ256_Alt : avx512_vptest_lowering;
+    defm BZ128_Alt : avx512_vptest_lowering;
+    defm WZ256_Alt : avx512_vptest_lowering;
+    defm WZ128_Alt : avx512_vptest_lowering;
  }
 }

@@ -5889,19 +5976,29 @@ def X86pcmpeqm : PatFrag<(ops node:$src1, node:$src2),
 def X86pcmpnem : PatFrag<(ops node:$src1, node:$src2),
                          (setcc node:$src1, node:$src2, SETNE)>;

+def X86pcmpeqm_su : PatFrag<(ops node:$src1, node:$src2),
+                            (X86pcmpeqm node:$src1, node:$src2), [{
+  return N->hasOneUse();
+}]>;
+def X86pcmpnem_su : PatFrag<(ops node:$src1, node:$src2),
+                            (X86pcmpnem node:$src1, node:$src2), [{
+  return N->hasOneUse();
+}]>;
+
 multiclass avx512_vptest_all_forms opc_wb, bits<8> opc_dq, string OpcodeStr,
-                                   PatFrag OpNode, X86SchedWriteWidths sched> :
-  avx512_vptest_wb,
-  avx512_vptest_dq;
+                                   PatFrag OpNode, PatFrag OpNode_su,
+                                   X86SchedWriteWidths sched> :
+  avx512_vptest_wb,
+  avx512_vptest_dq;

 defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86pcmpnem,
-                                       SchedWriteVecLogic>, T8PD;
+                                       X86pcmpnem_su, SchedWriteVecLogic>, T8PD;
 defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86pcmpeqm,
-                                        SchedWriteVecLogic>, T8XS;
+                                        X86pcmpeqm_su, SchedWriteVecLogic>, T8XS;

 multiclass avx512_vptest_lowering_pats {
  def : Pat<(_.KVT (OpNode (bitconvert
                            (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
@@ -5909,9 +6006,9 @@ multiclass avx512_vptest_lowering_pats(InstrStr # "rr") _.RC:$src1, _.RC:$src2)>;
  def : Pat<(_.KVT (and _.KRC:$mask,
-                       (OpNode (bitconvert
-                                (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                               _.ImmAllZerosV))),
+                       (OpNode_su (bitconvert
+                                   (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                                  _.ImmAllZerosV))),
            (!cast(InstrStr # "rrk") _.KRC:$mask, _.RC:$src1,
                                     _.RC:$src2)>;
@@ -5922,16 +6019,17 @@ multiclass avx512_vptest_lowering_pats(InstrStr # "rm") _.RC:$src1, addr:$src2)>;
  def : Pat<(_.KVT (and _.KRC:$mask,
-                       (OpNode (bitconvert
-                                (AndInfo.VT (and _.RC:$src1,
-                                                 (AndInfo.LdFrag addr:$src2)))),
-                               _.ImmAllZerosV))),
+                       (OpNode_su (bitconvert
+                                   (AndInfo.VT (and _.RC:$src1,
+                                                    (AndInfo.LdFrag addr:$src2)))),
+                                  _.ImmAllZerosV))),
            (!cast(InstrStr # "rmk") _.KRC:$mask, _.RC:$src1,
                                     addr:$src2)>;
 }

 // Patterns to use 512-bit instructions when 128/256 are not available.
 multiclass avx512_vptest_lowering_wide_pats {
@@ -5947,9 +6045,9 @@ multiclass avx512_vptest_lowering_wide_pats;
  def : Pat<(_.KVT (and _.KRC:$mask,
-                       (OpNode (bitconvert
-                                (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
-                               _.ImmAllZerosV))),
+                       (OpNode_su (bitconvert
+                                   (AndInfo.VT (and _.RC:$src1, _.RC:$src2))),
+                                  _.ImmAllZerosV))),
            (COPY_TO_REGCLASS
             (!cast(InstrStr#"rrk")
              (COPY_TO_REGCLASS _.KRC:$mask, ExtendInfo.KRC),
@@ -5961,62 +6059,63 @@ multiclass avx512_vptest_lowering_wide_pats {
  let Predicates = [prd, HasVLX] in {
-    defm : avx512_vptest_lowering_pats;
-    defm : avx512_vptest_lowering_pats;
  }
  let Predicates = [prd] in {
-    defm : avx512_vptest_lowering_pats;
  }

  let Predicates = [prd, NoVLX] in {
-    defm : avx512_vptest_lowering_wide_pats;
-    defm : avx512_vptest_lowering_wide_pats;
  }
 }

-multiclass avx512_vptest_lowering_types {
-  defm : avx512_vptest_lowering_sizes {
+  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
-  defm : avx512_vptest_lowering_sizes;
 }

-defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem>;
-defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm>;
+defm : avx512_vptest_lowering_types<"VPTESTM", X86pcmpnem, X86pcmpnem_su>;
+defm : avx512_vptest_lowering_types<"VPTESTNM", X86pcmpeqm, X86pcmpeqm_su>;

 //===----------------------------------------------------------------------===//
 // AVX-512 Shift instructions
@@ -12469,12 +12568,19 @@ defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SchedWriteVecALU,
 defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>;
 defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>;

+def X86Vpshufbitqmb_su : PatFrag<(ops node:$src1, node:$src2),
+                                 (X86Vpshufbitqmb node:$src1, node:$src2), [{
+  return N->hasOneUse();
+}]>;
+
 multiclass VPSHUFBITQMB_rm {
  defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst),
                                (ins VTI.RC:$src1, VTI.RC:$src2),
                                "vpshufbitqmb",
                                "$src2, $src1", "$src1, $src2",
                                (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+                                (VTI.VT VTI.RC:$src2)),
+                               (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
                                 (VTI.VT VTI.RC:$src2))>, EVEX_4V, T8PD,
                                Sched<[sched]>;
  defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
@@ -12482,6 +12588,8 @@ multiclass VPSHUFBITQMB_rm {
                                "vpshufbitqmb",
                                "$src2, $src1", "$src1, $src2",
                                (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+                                (VTI.VT (VTI.LdFrag addr:$src2))),
+                               (X86Vpshufbitqmb_su (VTI.VT VTI.RC:$src1),
                                 (VTI.VT (VTI.LdFrag addr:$src2)))>,
                                EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
                                Sched<[sched.Folded, sched.ReadAfterFold]>;

diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index 59b0d824fc1..0159d9196da 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -9254,7 +9254,7 @@ define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
 ; X86-NEXT: vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vptestmd %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x27,0xc9]
+; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
@@ -9265,10 +9265,10 @@ define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
 ; X64: # %bb.0:
 ; X64-NEXT: vptestmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vptestmd %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x27,0xc9]
-; X64-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
-; X64-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
+; X64-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
+; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT: leal (%rcx,%rax), %eax # encoding: [0x8d,0x04,0x01]
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq # encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
@@ -9313,7 +9313,7 @@ define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
 ; X86-NEXT: vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vptestmq %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x27,0xc9]
+; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
@@ -9324,10 +9324,10 @@ define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
 ; X64: # %bb.0:
 ; X64-NEXT: vptestmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vptestmq %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x27,0xc9]
-; X64-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
-; X64-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
+; X64-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
+; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT: leal (%rcx,%rax), %eax # encoding: [0x8d,0x04,0x01]
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq # encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
@@ -9344,7 +9344,7 @@ define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
 ; X86-NEXT: vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vptestmq %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x27,0xc9]
+; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
@@ -9356,10 +9356,10 @@ define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
 ; X64: # %bb.0:
 ; X64-NEXT: vptestmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vptestmq %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x27,0xc9]
-; X64-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
-; X64-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
+; X64-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
+; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT: leal (%rcx,%rax), %eax # encoding: [0x8d,0x04,0x01]
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT: retq # encoding: [0xc3]
@@ -9377,7 +9377,7 @@ define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2
 ; X86-NEXT: vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vptestnmd %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x27,0xc9]
+; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
@@ -9388,10 +9388,10 @@ define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2
 ; X64: # %bb.0:
 ; X64-NEXT: vptestnmd %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vptestnmd %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf2,0x7e,0x09,0x27,0xc9]
-; X64-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
-; X64-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
+; X64-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
+; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT: leal (%rcx,%rax), %eax # encoding: [0x8d,0x04,0x01]
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq # encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
@@ -9436,7 +9436,7 @@ define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2
 ; X86-NEXT: vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vptestnmq %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf2,0xfe,0x09,0x27,0xc9]
+; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
@@ -9447,10 +9447,10 @@ define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2
 ; X64: # %bb.0:
 ; X64-NEXT: vptestnmq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vptestnmq %xmm1, %xmm0, %k1 {%k1} # encoding: [0x62,0xf2,0xfe,0x09,0x27,0xc9]
-; X64-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
-; X64-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
+; X64-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
+; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT: leal (%rcx,%rax), %eax # encoding: [0x8d,0x04,0x01]
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: retq # encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
@@ -9467,7 +9467,7 @@ define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2
 ; X86-NEXT: vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
-; X86-NEXT: vptestnmq %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf2,0xfe,0x29,0x27,0xc9]
+; X86-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
 ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
 ; X86-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
 ; X86-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
@@ -9479,10 +9479,10 @@ define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2
 ; X64: # %bb.0:
 ; X64-NEXT: vptestnmq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
-; X64-NEXT: vptestnmq %ymm1, %ymm0, %k1 {%k1} # encoding: [0x62,0xf2,0xfe,0x29,0x27,0xc9]
-; X64-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9]
-; X64-NEXT: kmovw %k0, %eax # encoding: [0xc5,0xf8,0x93,0xc0]
-; X64-NEXT: addb %cl, %al # encoding: [0x00,0xc8]
+; X64-NEXT: kandw %k1, %k0, %k1 # encoding: [0xc5,0xfc,0x41,0xc9]
+; X64-NEXT: kmovw %k1, %eax # encoding: [0xc5,0xf8,0x93,0xc1]
+; X64-NEXT: kmovw %k0, %ecx # encoding: [0xc5,0xf8,0x93,0xc8]
+; X64-NEXT: leal (%rcx,%rax), %eax # encoding: [0x8d,0x04,0x01]
 ; X64-NEXT: # kill: def $al killed $al killed $eax
 ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-NEXT: retq # encoding: [0xc3]

diff --git a/test/CodeGen/X86/vec_uaddo.ll b/test/CodeGen/X86/vec_uaddo.ll
index 93c3954afb7..36dc9311731 100644
--- a/test/CodeGen/X86/vec_uaddo.ll
+++ b/test/CodeGen/X86/vec_uaddo.ll
@@ -1202,14 +1202,13 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k2
-; AVX512-NEXT: kxnorw %k1, %k0, %k1
-; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 {%k1}
+; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
+; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: kandnw %k0, %k1, %k2
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; AVX512-NEXT: kmovd %k2, %eax
+; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k2} {z}
+; AVX512-NEXT: kmovd %k1, %eax
 ; AVX512-NEXT: movb %al, (%rdi)
 ; AVX512-NEXT: retq
   %t = call {<4 x i1>, <4 x i1>} @llvm.uadd.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)

diff --git a/test/CodeGen/X86/vector-compare-all_of.ll b/test/CodeGen/X86/vector-compare-all_of.ll
index e79c01f43b3..b05d0935ad1 100644
--- a/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/test/CodeGen/X86/vector-compare-all_of.ll
@@ -1246,7 +1246,7 @@ define i1 @bool_reduction_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0
 ; AVX512-NEXT: kshiftrw $8, %k0, %k1
-; AVX512-NEXT: vpcmpgtb %xmm1, %xmm0, %k0 {%k1}
+; AVX512-NEXT: kandw %k0, %k1, %k0
 ; AVX512-NEXT: kshiftrw $4, %k0, %k1
 ; AVX512-NEXT: kandw %k0, %k1, %k0
 ; AVX512-NEXT: kshiftrw $2, %k0, %k1
@@ -1436,7 +1436,7 @@ define i1 @bool_reduction_v16i16(<16 x i16> %x, <16 x i16> %y) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %k0
 ; AVX512-NEXT: kshiftrw $8, %k0, %k1
-; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %k0 {%k1}
+; AVX512-NEXT: kandw %k0, %k1, %k0
 ; AVX512-NEXT: kshiftrw $4, %k0, %k1
 ; AVX512-NEXT: kandw %k0, %k1, %k0
 ; AVX512-NEXT: kshiftrw $2, %k0, %k1
@@ -1497,7 +1497,7 @@ define i1 @bool_reduction_v32i8(<32 x i8> %x, <32 x i8> %y) {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpcmpeqb %ymm1, %ymm0, %k0
 ; AVX512-NEXT: kshiftrd $16, %k0, %k1
-; AVX512-NEXT: vpcmpeqb %ymm1, %ymm0, %k0 {%k1}
+; AVX512-NEXT: kandd %k0, %k1, %k0
 ; AVX512-NEXT: kshiftrd $8, %k0, %k1
 ; AVX512-NEXT: kandd %k0, %k1, %k0
 ; AVX512-NEXT: kshiftrd $4, %k0, %k1
-- 
2.50.1
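
Note: every masked-compare fold in this patch follows the same single-use
("_su") PatFrag idiom: the masked instruction patterns and the
(and _.KRCWM:$mask, ...) selection patterns match only the _su wrapper, so a
setcc or test node with additional users is selected once as an unmasked
compare and the mask is applied afterwards with a kand. A minimal sketch of
the wrapper, using X86somecmp as a placeholder node name (not one of the
nodes touched by this commit):

def X86somecmp_su : PatFrag<(ops node:$src1, node:$src2),
                            (X86somecmp node:$src1, node:$src2), [{
  // Only fold into a masked compare when this compare has a single use;
  // a multi-use compare is cheaper as one unmasked compare plus a KAND
  // of its mask result.
  return N->hasOneUse();
}]>;

The wrapper is then passed alongside the unrestricted fragment (as the RHS_su
operand of AVX512_maskable_cmp above), exactly as X86cmpm_su, X86pcmpgtm_su,
and the other _su fragments are wired up in this patch.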