From: Craig Topper
Date: Wed, 16 Jan 2019 21:46:28 +0000 (+0000)
Subject: [X86] Use X86ISD::BLENDV for blendv intrinsics. Replace vselect with blendv just...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=0ab0885de0dceb4e23e01665e975f283e5732dc3;p=llvm

[X86] Use X86ISD::BLENDV for blendv intrinsics. Replace vselect with blendv
just before isel table lookup. Remove vselect isel patterns.

This cleans up the duplication we had between the intrinsic isel patterns and
the vselect isel patterns. It should also allow the intrinsics to get
SimplifyDemandedBits support for the condition.

I've switched the canonical pattern in isel to use the X86ISD::BLENDV node
instead of VSELECT, since it always seemed weird to move from BLENDV, with its
relaxed rules on condition bits, to VSELECT, which has strict rules about all
bits of the condition element being the same. It's more correct to go from
VSELECT to BLENDV.

Differential Revision: https://reviews.llvm.org/D56771

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351380 91177308-0d34-0410-b5e6-96231b3b80d8
---
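Note: the one-way legality argument in the message is easy to see at the lane
level. Below is a minimal standalone sketch (plain C++, not LLVM code; the
helper names are illustrative) of the two condition-bit contracts: BLENDV
consumes only the sign bit of each condition element, while VSELECT assumes
every bit of the element is the same.

#include <cassert>
#include <cstdint>

// BLENDV contract: only the sign bit of each condition element matters.
uint32_t blendv_lane(uint32_t cond, uint32_t t, uint32_t f) {
  return (cond & 0x80000000u) ? t : f;
}

// VSELECT contract: all bits of each condition element must agree
// (all-ones or all-zeros), so generic optimizations may test any bit.
uint32_t vselect_lane(uint32_t cond, uint32_t t, uint32_t f) {
  assert((cond == 0u || cond == ~0u) && "non-canonical VSELECT condition");
  return cond ? t : f;
}

int main() {
  // 0x80000001 is a legal BLENDV condition (sign bit set)...
  assert(blendv_lane(0x80000001u, 1, 2) == 1);
  // ...but it violates VSELECT's all-bits-equal precondition, which is why
  // rewriting BLENDV as VSELECT was unsound while VSELECT -> BLENDV is safe.
  return 0;
}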
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5ac153244df..a08030bb885 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3381,13 +3381,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
 
-  case X86ISD::BLENDV: {
-    // BLENDV selects like a regular VSELECT.
-    SDValue VSelect = CurDAG->getNode(
-        ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
+  case ISD::VSELECT: {
+    // Replace VSELECT with non-mask conditions with BLENDV.
+    if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+      break;
+
+    assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+    SDValue Blendv = CurDAG->getNode(
+        X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
         Node->getOperand(1), Node->getOperand(2));
-    ReplaceNode(Node, VSelect.getNode());
-    SelectCode(VSelect.getNode());
+    ReplaceNode(Node, Blendv.getNode());
+    SelectCode(Blendv.getNode());
     // We already called ReplaceUses.
     return;
   }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ece273302e6..504d42ce6a5 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21840,6 +21840,17 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                               Src1, Src2, Src3),
                   Mask, PassThru, Subtarget, DAG);
     }
+    case BLENDV: {
+      SDValue Src1 = Op.getOperand(1);
+      SDValue Src2 = Op.getOperand(2);
+      SDValue Src3 = Op.getOperand(3);
+
+      EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger();
+      Src3 = DAG.getBitcast(MaskVT, Src3);
+
+      // Reverse the operands to match VSELECT order.
+      return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1);
+    }
     case VPERM_2OP : {
       SDValue Src1 = Op.getOperand(1);
       SDValue Src2 = Op.getOperand(2);
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 910acd80e8b..d2be32a8622 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -204,7 +204,8 @@ namespace llvm {
     /// Dynamic (non-constant condition) vector blend where only the sign bits
     /// of the condition elements are used. This is used to enforce that the
     /// condition mask is not valid for generic VSELECT optimizations. This
-    /// can also be used to implement the intrinsics.
+    /// is also used to implement the intrinsics.
+    /// Operands are in VSELECT order: MASK, TRUE, FALSE
     BLENDV,
 
     /// Combined add and sub on an FP vector.
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 11a27ba9058..bbf2b92bf37 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -448,6 +448,12 @@ def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
 def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
 
 def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
+def X86Blendv : SDNode<"X86ISD::BLENDV",
+                       SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>,
+                                            SDTCisSameAs<0, 2>,
+                                            SDTCisSameAs<2, 3>,
+                                            SDTCisSameNumEltsAs<0, 1>,
+                                            SDTCisSameSizeAs<0, 1>]>>;
 
 def X86Addsub : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
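Note: the operand reordering done by the new BLENDV case of
LowerINTRINSIC_WO_CHAIN above can be modeled as scalar C++ (illustrative
names, not LLVM code). The blendv intrinsics take (false, true, mask), while
the node, like VSELECT, takes (mask, true, false) -- hence the "reverse the
operands" comment.

#include <cassert>
#include <cstdint>

// Node-order form, matching X86ISD::BLENDV / VSELECT: (cond, true, false).
// The sign bit of the condition decides.
uint32_t blendv_node(uint32_t cond, uint32_t t, uint32_t f) {
  return (int32_t)cond < 0 ? t : f;
}

// Intrinsic-order form, matching e.g. _mm_blendv_ps(a, b, mask): a is the
// "false" value, b the "true" value. Lowering just re-emits the operands
// as (mask, b, a), i.e. reversed.
uint32_t blendv_intrinsic(uint32_t a, uint32_t b, uint32_t mask) {
  return blendv_node(/*cond=*/mask, /*t=*/b, /*f=*/a);
}

int main() {
  assert(blendv_intrinsic(10, 20, 0x80000000u) == 20); // sign set -> b
  assert(blendv_intrinsic(10, 20, 0x7FFFFFFFu) == 10); // sign clear -> a
  return 0;
}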
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index e2bcd18ce66..58aac7951b5 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6582,16 +6582,16 @@ def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
                              VR128:$src2, sub_xmm), 0xf)>;
 }
 
-/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
-multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
-                                    RegisterClass RC, X86MemOperand x86memop,
-                                    PatFrag mem_frag, Intrinsic IntId,
-                                    X86FoldableSchedWrite sched> {
+/// SS41I_quaternary_avx - AVX SSE 4.1 with 4 operands
+multiclass SS41I_quaternary_avx<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                                X86MemOperand x86memop, ValueType VT,
+                                PatFrag mem_frag, SDNode OpNode,
+                                X86FoldableSchedWrite sched> {
   def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
                   (ins RC:$src1, RC:$src2, RC:$src3),
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                  [(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
+                  [(set RC:$dst, (VT (OpNode RC:$src3, RC:$src2, RC:$src1)))],
                   SSEPackedInt>, TAPD, VEX_4V, Sched<[sched]>;
 
@@ -6600,8 +6600,8 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                   !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                   [(set RC:$dst,
-                        (IntId RC:$src1, (mem_frag addr:$src2),
-                         RC:$src3))], SSEPackedInt>, TAPD, VEX_4V,
+                        (OpNode RC:$src3, (mem_frag addr:$src2),
+                         RC:$src1))], SSEPackedInt>, TAPD, VEX_4V,
                   Sched<[sched.Folded, sched.ReadAfterFold,
                          // x86memop:$src2
                          ReadDefault, ReadDefault, ReadDefault, ReadDefault,
@@ -6612,68 +6612,47 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
 
 let Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedDouble in {
-defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
-                                          load, int_x86_sse41_blendvpd,
-                                          SchedWriteFVarBlend.XMM>;
-defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
-                                           loadv4f64, int_x86_avx_blendv_pd_256,
-                                           SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPD : SS41I_quaternary_avx<0x4B, "vblendvpd", VR128, f128mem,
+                                      v2f64, loadv2f64, X86Blendv,
+                                      SchedWriteFVarBlend.XMM>;
+defm VBLENDVPDY : SS41I_quaternary_avx<0x4B, "vblendvpd", VR256, f256mem,
+                                       v4f64, loadv4f64, X86Blendv,
+                                       SchedWriteFVarBlend.YMM>, VEX_L;
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
-defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
-                                          load, int_x86_sse41_blendvps,
-                                          SchedWriteFVarBlend.XMM>;
-defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
-                                           loadv8f32, int_x86_avx_blendv_ps_256,
-                                           SchedWriteFVarBlend.YMM>, VEX_L;
+defm VBLENDVPS : SS41I_quaternary_avx<0x4A, "vblendvps", VR128, f128mem,
+                                      v4f32, loadv4f32, X86Blendv,
+                                      SchedWriteFVarBlend.XMM>;
+defm VBLENDVPSY : SS41I_quaternary_avx<0x4A, "vblendvps", VR256, f256mem,
+                                       v8f32, loadv8f32, X86Blendv,
+                                       SchedWriteFVarBlend.YMM>, VEX_L;
 } // ExeDomain = SSEPackedSingle
-defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
-                                          load, int_x86_sse41_pblendvb,
-                                          SchedWriteVarBlend.XMM>;
+defm VPBLENDVB : SS41I_quaternary_avx<0x4C, "vpblendvb", VR128, i128mem,
+                                      v16i8, loadv16i8, X86Blendv,
+                                      SchedWriteVarBlend.XMM>;
 }
 
 let Predicates = [HasAVX2] in {
-defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                           load, int_x86_avx2_pblendvb,
-                                           SchedWriteVarBlend.YMM>, VEX_L;
+defm VPBLENDVBY : SS41I_quaternary_avx<0x4C, "vpblendvb", VR256, i256mem,
+                                       v32i8, loadv32i8, X86Blendv,
+                                       SchedWriteVarBlend.YMM>, VEX_L;
}
 
 let Predicates = [HasAVX] in {
-  def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
-                            (v16i8 VR128:$src2))),
-            (VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
-                            (v4i32 VR128:$src2))),
+  def : Pat<(v4i32 (X86Blendv (v4i32 VR128:$mask), (v4i32 VR128:$src1),
+                              (v4i32 VR128:$src2))),
             (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
-                            (v4f32 VR128:$src2))),
-            (VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
-                            (v2i64 VR128:$src2))),
-            (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
-                            (v2f64 VR128:$src2))),
+  def : Pat<(v2i64 (X86Blendv (v2i64 VR128:$mask), (v2i64 VR128:$src1),
+                              (v2i64 VR128:$src2))),
             (VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
-  def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
-                            (v8i32 VR256:$src2))),
-            (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-  def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
-                            (v8f32 VR256:$src2))),
+  def : Pat<(v8i32 (X86Blendv (v8i32 VR256:$mask), (v8i32 VR256:$src1),
+                              (v8i32 VR256:$src2))),
             (VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-  def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
-                            (v4i64 VR256:$src2))),
-            (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-  def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
-                            (v4f64 VR256:$src2))),
+  def : Pat<(v4i64 (X86Blendv (v4i64 VR256:$mask), (v4i64 VR256:$src1),
+                              (v4i64 VR256:$src2))),
             (VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
 }
 
-let Predicates = [HasAVX2] in {
-  def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
-                            (v32i8 VR256:$src2))),
-            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
-}
-
 // Prefer a movss or movsd over a blendps when optimizing for size. these were
 // changed to use blends because blends have better throughput on sandybridge
 // and haswell, but movs[s/d] are 1-2 byte shorter instructions.
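Note: the FP element types are now matched directly by the multiclass
patterns, so the surviving Pat definitions only remap integer-element blends
onto the FP-domain instructions. That remapping is sound because the hardware
tests one sign bit per element of the stated width, so only the element width
has to line up. A standalone emulation of the two granularities (plain C++,
illustrative names):

#include <cstddef>
#include <cstdint>

// vpblendvb granularity: one control bit per byte (sign bit of each mask
// byte); this is what the multiclass pattern for v16i8/v32i8 selects.
void pblendvb(uint8_t *dst, const uint8_t *f, const uint8_t *t,
              const uint8_t *mask, size_t n) {
  for (size_t i = 0; i < n; ++i)
    dst[i] = (mask[i] & 0x80) ? t[i] : f[i];
}

// vblendvps granularity: one control bit per 32-bit lane (sign bit of each
// mask dword). This matches X86Blendv semantics for v4i32 exactly, which is
// why the v4i32 pattern may select VBLENDVPS even for "integer" data.
void blendvps(uint32_t *dst, const uint32_t *f, const uint32_t *t,
              const uint32_t *mask, size_t n) {
  for (size_t i = 0; i < n; ++i)
    dst[i] = (mask[i] & 0x80000000u) ? t[i] : f[i];
}

int main() {
  uint32_t f[4] = {1, 2, 3, 4}, t[4] = {10, 20, 30, 40};
  uint32_t mask[4] = {0x80000000u, 0, 0x80000000u, 0}, out[4];
  blendvps(out, f, t, mask, 4);
  return (out[0] == 10 && out[1] == 2 && out[2] == 30 && out[3] == 4) ? 0 : 1;
}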
@@ -6747,16 +6726,17 @@ let Predicates = [UseSSE41, OptForSpeed] in {
 }
 
-/// SS41I_ternary_int - SSE 4.1 ternary operator
+/// SS41I_ternary - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
-  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                               X86MemOperand x86memop, Intrinsic IntId,
-                               X86FoldableSchedWrite sched> {
+  multiclass SS41I_ternary<bits<8> opc, string OpcodeStr, ValueType VT,
+                           PatFrag mem_frag, X86MemOperand x86memop,
+                           SDNode OpNode, X86FoldableSchedWrite sched> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
-                   [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+                   [(set VR128:$dst,
+                         (VT (OpNode XMM0, VR128:$src2, VR128:$src1)))]>,
                    Sched<[sched]>;
 
    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, x86memop:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                    [(set VR128:$dst,
-                         (IntId VR128:$src1, (mem_frag addr:$src2), XMM0))]>,
+                         (OpNode XMM0, (mem_frag addr:$src2), VR128:$src1))]>,
                    Sched<[sched.Folded, sched.ReadAfterFold]>;
   }
 }
 
 let ExeDomain = SSEPackedDouble in
-defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memop, f128mem,
-                                  int_x86_sse41_blendvpd, SchedWriteFVarBlend.XMM>;
+defm BLENDVPD : SS41I_ternary<0x15, "blendvpd", v2f64, memopv2f64, f128mem,
+                              X86Blendv, SchedWriteFVarBlend.XMM>;
 let ExeDomain = SSEPackedSingle in
-defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memop, f128mem,
-                                  int_x86_sse41_blendvps, SchedWriteFVarBlend.XMM>;
-defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memop, i128mem,
-                                  int_x86_sse41_pblendvb, SchedWriteVarBlend.XMM>;
+defm BLENDVPS : SS41I_ternary<0x14, "blendvps", v4f32, memopv4f32, f128mem,
+                              X86Blendv, SchedWriteFVarBlend.XMM>;
+defm PBLENDVB : SS41I_ternary<0x10, "pblendvb", v16i8, memopv16i8, i128mem,
+                              X86Blendv, SchedWriteVarBlend.XMM>;
 
 // Aliases with the implicit xmm0 argument
 def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
@@ -6794,20 +6773,11 @@ def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
                 (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
 
 let Predicates = [UseSSE41] in {
-  def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
-                            (v16i8 VR128:$src2))),
-            (PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
-  def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
-                            (v4i32 VR128:$src2))),
+  def : Pat<(v4i32 (X86Blendv (v4i32 XMM0), (v4i32 VR128:$src1),
+                              (v4i32 VR128:$src2))),
             (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
-  def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
-                            (v4f32 VR128:$src2))),
-            (BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
-  def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
-                            (v2i64 VR128:$src2))),
-            (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
-  def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
-                            (v2f64 VR128:$src2))),
+  def : Pat<(v2i64 (X86Blendv (v2i64 XMM0), (v2i64 VR128:$src1),
+                              (v2i64 VR128:$src2))),
             (BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
 }
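Note: the legacy (non-VEX) encoding keeps the mask implicitly in xmm0, which
is what Uses = [XMM0] and the rr0/rm0 forms above express; at the source level
the intrinsic hides this and takes the mask as an ordinary argument. A small
compilable usage example (assumes an SSE4.1-capable target, e.g. built with
-msse4.1):

#include <cstdio>
#include <smmintrin.h>

int main() {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);     // "false" values
  __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f); // "true" values
  // Sign bit set in lanes 0 and 2 only.
  __m128 mask =
      _mm_castsi128_ps(_mm_set_epi32(0, (int)0x80000000, 0, (int)0x80000000));
  __m128 r = _mm_blendv_ps(a, b, mask);

  float out[4];
  _mm_storeu_ps(out, r);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  // Expected: 10 2 30 4 (lanes with the sign bit set take values from b).
  return 0;
}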
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 151e1b9136c..37badd85580 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -23,7 +23,7 @@ enum IntrinsicType : uint16_t {
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX, FPCLASSS,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP, INTR_TYPE_3OP_IMM8,
-  CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
+  CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV,
   CVTPD2PS, CVTPD2PS_MASK, CVTPD2PS_RND_MASK,
   INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
   INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
@@ -340,6 +340,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(addcarry_64, ADX, X86ISD::ADC, X86ISD::ADD),
   X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
   X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+  X86_INTRINSIC_DATA(avx_blendv_pd_256, BLENDV, X86ISD::BLENDV, 0),
+  X86_INTRINSIC_DATA(avx_blendv_ps_256, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256, CVTPD2PS, ISD::FP_ROUND, 0),
@@ -369,6 +371,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -1156,8 +1159,11 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
   X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
   X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
+  X86_INTRINSIC_DATA(sse41_blendvpd, BLENDV, X86ISD::BLENDV, 0),
+  X86_INTRINSIC_DATA(sse41_blendvps, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
   X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(sse41_pblendvb, BLENDV, X86ISD::BLENDV, 0),
   X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
   X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
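Note: the records added above feed a sorted, table-driven dispatch:
LowerINTRINSIC_WO_CHAIN looks up the intrinsic ID in IntrinsicsWithoutChain
and switches on the IntrinsicType tag (now including BLENDV), emitting the
node named by Opc0. A simplified standalone model of that mechanism (the IDs
and opcode values below are made up for illustration, not the real table):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>

enum IntrinsicType : uint16_t { INTR_TYPE_2OP, BLENDV /* ... */ };

struct IntrinsicData {
  unsigned Id;        // stand-in for the llvm::Intrinsic::ID enum value
  IntrinsicType Type; // dispatch tag used by LowerINTRINSIC_WO_CHAIN
  unsigned Opc0;      // target node to emit (e.g. X86ISD::BLENDV)
  bool operator<(const IntrinsicData &RHS) const { return Id < RHS.Id; }
};

// Must be kept sorted by Id, like the real table.
static const IntrinsicData Table[] = {
    {100, BLENDV, /*X86ISD::BLENDV=*/42},
    {101, BLENDV, 42},
    {205, INTR_TYPE_2OP, 7},
};

static const IntrinsicData *lookup(unsigned Id) {
  IntrinsicData Key{Id, INTR_TYPE_2OP, 0};
  const IntrinsicData *I =
      std::lower_bound(std::begin(Table), std::end(Table), Key);
  return (I != std::end(Table) && I->Id == Id) ? I : nullptr;
}

int main() {
  const IntrinsicData *D = lookup(101);
  assert(D && D->Type == BLENDV && D->Opc0 == 42);
  return 0;
}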