From 5a5e8a7eb6c51bef04ba60a404236e2bc97894a1 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 30 Jan 2017 00:06:01 +0000 Subject: [PATCH] [AVX-512] Don't reuse VSHLI/VSRLI for mask register shifts. VSHLI/VSRLI shift within elements while KSHIFT moves whole elements. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293448 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 42 ++++++++++++------------- lib/Target/X86/X86ISelLowering.h | 3 ++ lib/Target/X86/X86InstrAVX512.td | 8 ++--- lib/Target/X86/X86InstrFragmentsSIMD.td | 9 ++++++ 4 files changed, 37 insertions(+), 25 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e5ff4dc8828..febc45abc4c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5021,7 +5021,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (Vec.isUndef()) { if (IdxVal != 0) { SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits); + WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, + ShiftBits); } return ExtractSubVec(WideSubVec); } @@ -5030,9 +5031,9 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + Vec = ShiftRight ? 
DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec; return ExtractSubVec(Vec); } @@ -5041,8 +5042,8 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Zero lower bits of the Vec SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); // Merge them together, SubVec should be zero extended. WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, getZeroVector(WideOpVT, Subtarget, DAG, dl), @@ -5054,12 +5055,12 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { // Zero upper bits of the Vec - WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, + WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, DAG.getConstant(IdxVal, dl, MVT::i8)); SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); return ExtractSubVec(Vec); } @@ -13658,9 +13659,9 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const } unsigned MaxSift = VecVT.getVectorNumElements() - 1; if (MaxSift - IdxVal) - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, 
DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(MaxSift, dl, MVT::i8)); return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec, DAG.getIntPtrConstant(0, dl)); @@ -13802,7 +13803,7 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if(Vec.isUndef()) { if (IdxVal) - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); return EltInVec; } @@ -13812,21 +13813,21 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if (IdxVal == 0 ) { // EltInVec already at correct index and other bits are 0. // Clean the first bit in source vector. - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } if (IdxVal == NumElems -1) { // Move the bit to the last position inside the vector. - EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec, + EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Clean the last bit in the source vector. 
- Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, DAG.getConstant(1, dl, MVT::i8)); - Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec, + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, DAG.getConstant(1 , dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); @@ -23945,6 +23946,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::TESTNM: return "X86ISD::TESTNM"; case X86ISD::KORTEST: return "X86ISD::KORTEST"; case X86ISD::KTEST: return "X86ISD::KTEST"; + case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL"; + case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR"; case X86ISD::PACKSS: return "X86ISD::PACKSS"; case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; @@ -30494,10 +30497,7 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG, bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; EVT VT = N->getValueType(0); unsigned NumBitsPerElt = VT.getScalarSizeInBits(); - - // This fails for mask register (vXi1) shifts. - if ((NumBitsPerElt % 8) != 0) - return SDValue(); + assert((NumBitsPerElt % 8) == 0); // Out of range logical bit shifts are guaranteed to be zero. // Out of range arithmetic bit shifts splat the sign bit. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 37f9353042b..9969c909347 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -320,6 +320,9 @@ namespace llvm { // Vector shift elements by immediate VSHLI, VSRLI, VSRAI, + // Shifts of mask registers. 
+ KSHIFTL, KSHIFTR, + // Bit rotate by immediate VROTLI, VROTRI, diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index cbcaac1aac8..38719ad93b4 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2536,8 +2536,8 @@ multiclass avx512_mask_shiftop_w opc1, bits<8> opc2, string OpcodeStr, } } -defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86vshli>; -defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86vsrli>; +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>; // Mask setting all 0s or 1s multiclass avx512_mask_setop { @@ -2618,12 +2618,12 @@ def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), // Patterns for kmask shift multiclass mask_shift_lowering { - def : Pat<(VT (X86vshli RC:$src, (i8 imm:$imm))), + def : Pat<(VT (X86kshiftl RC:$src, (i8 imm:$imm))), (VT (COPY_TO_REGCLASS (KSHIFTLWri (COPY_TO_REGCLASS RC:$src, VK16), (I8Imm $imm)), RC))>; - def : Pat<(VT (X86vsrli RC:$src, (i8 imm:$imm))), + def : Pat<(VT (X86kshiftr RC:$src, (i8 imm:$imm))), (VT (COPY_TO_REGCLASS (KSHIFTRWri (COPY_TO_REGCLASS RC:$src, VK16), (I8Imm $imm)), diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index c5689d7c698..196ba39e8ac 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -200,6 +200,15 @@ def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; +def X86kshiftl : SDNode<"X86ISD::KSHIFTL", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, + SDTCisSameAs<0, 1>, + SDTCisVT<2, i8>]>>; +def X86kshiftr : SDNode<"X86ISD::KSHIFTR", + SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, + SDTCisSameAs<0, 1>, + SDTCisVT<2, i8>]>>; + def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>; def 
X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>; -- 2.50.1