From: Ulrich Weigand Date: Fri, 4 Aug 2017 18:57:58 +0000 (+0000) Subject: [SystemZ] Add support for 128-bit atomic load/store/cmpxchg X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e6a2101d1d25ddac207faccdc474f66e758b9924;p=llvm [SystemZ] Add support for 128-bit atomic load/store/cmpxchg This adds support for the main 128-bit atomic operations, using the SystemZ instructions LPQ, STPQ, and CDSG. Generating these instructions is a bit more complex than usual since the i128 type is not legal for the back-end. Therefore, we have to hook the LowerOperationWrapper and ReplaceNodeResults TargetLowering callbacks. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310094 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 42d4a8d3185..dc0de5fd3c7 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -222,6 +222,12 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom); + // Even though i128 is not a legal type, we still need to custom lower + // the atomic operations in order to exploit SystemZ instructions. + setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); // Traps are legal, as we will convert them to "j .+2". @@ -4789,6 +4795,88 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, } } +// Lower operations with invalid operand or result types (currently used +// only for 128-bit integer types). 
+ +static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) { + SDLoc DL(In); + SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In, + DAG.getIntPtrConstant(0, DL)); + SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In, + DAG.getIntPtrConstant(1, DL)); + SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL, + MVT::Untyped, Hi, Lo); + return SDValue(Pair, 0); +} + +static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) { + SDLoc DL(In); + SDValue Hi = DAG.getTargetExtractSubreg(SystemZ::subreg_h64, + DL, MVT::i64, In); + SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64, + DL, MVT::i64, In); + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi); +} + +void +SystemZTargetLowering::LowerOperationWrapper(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + case ISD::ATOMIC_LOAD: { + SDLoc DL(N); + SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::Other); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1) }; + MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); + SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_LOAD_128, + DL, Tys, Ops, MVT::i128, MMO); + Results.push_back(lowerGR128ToI128(DAG, Res)); + Results.push_back(Res.getValue(1)); + break; + } + case ISD::ATOMIC_STORE: { + SDLoc DL(N); + SDVTList Tys = DAG.getVTList(MVT::Other); + SDValue Ops[] = { N->getOperand(0), + lowerI128ToGR128(DAG, N->getOperand(2)), + N->getOperand(1) }; + MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); + SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_STORE_128, + DL, Tys, Ops, MVT::i128, MMO); + // We have to enforce sequential consistency by performing a + // serialization operation after the store.
+ if (cast<AtomicSDNode>(N)->getOrdering() == + AtomicOrdering::SequentiallyConsistent) + Res = SDValue(DAG.getMachineNode(SystemZ::Serialize, DL, + MVT::Other, Res), 0); + Results.push_back(Res); + break; + } + case ISD::ATOMIC_CMP_SWAP: { + SDLoc DL(N); + SDVTList Tys = DAG.getVTList(MVT::Untyped, MVT::Other); + SDValue Ops[] = { N->getOperand(0), N->getOperand(1), + lowerI128ToGR128(DAG, N->getOperand(2)), + lowerI128ToGR128(DAG, N->getOperand(3)) }; + MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); + SDValue Res = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAP_128, + DL, Tys, Ops, MVT::i128, MMO); + Results.push_back(lowerGR128ToI128(DAG, Res)); + Results.push_back(Res.getValue(1)); + break; + } + default: + llvm_unreachable("Unexpected node to lower"); + } +} + +void +SystemZTargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + return LowerOperationWrapper(N, Results, DAG); +} + const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { #define OPCODE(NAME) case SystemZISD::NAME: return "SystemZISD::" #NAME switch ((SystemZISD::NodeType)Opcode) { @@ -4889,6 +4977,9 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(ATOMIC_LOADW_UMIN); OPCODE(ATOMIC_LOADW_UMAX); OPCODE(ATOMIC_CMP_SWAPW); + OPCODE(ATOMIC_LOAD_128); + OPCODE(ATOMIC_STORE_128); + OPCODE(ATOMIC_CMP_SWAP_128); OPCODE(LRV); OPCODE(STRV); OPCODE(PREFETCH); @@ -5916,6 +6007,32 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, return DoneMBB; } +// Emit a move from two GR64s to a GR128.
+MachineBasicBlock * +SystemZTargetLowering::emitPair128(MachineInstr &MI, + MachineBasicBlock *MBB) const { + MachineFunction &MF = *MBB->getParent(); + const SystemZInstrInfo *TII = + static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); + MachineRegisterInfo &MRI = MF.getRegInfo(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dest = MI.getOperand(0).getReg(); + unsigned Hi = MI.getOperand(1).getReg(); + unsigned Lo = MI.getOperand(2).getReg(); + unsigned Tmp1 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + unsigned Tmp2 = MRI.createVirtualRegister(&SystemZ::GR128BitRegClass); + + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Tmp1); + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Tmp2) + .addReg(Tmp1).addReg(Hi).addImm(SystemZ::subreg_h64); + BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest) + .addReg(Tmp2).addReg(Lo).addImm(SystemZ::subreg_l64); + + MI.eraseFromParent(); + return MBB; +} + // Emit an extension from a GR64 to a GR128. ClearEven is true // if the high register of the GR128 value must be cleared or false if // it's "don't care". @@ -6309,6 +6426,8 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::CondStoreF64Inv: return emitCondStore(MI, MBB, SystemZ::STD, 0, true); + case SystemZ::PAIR128: + return emitPair128(MI, MBB); case SystemZ::AEXT128: return emitExt128(MI, MBB, false); case SystemZ::ZEXT128: diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index a59f507477d..ed5786dcd42 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -308,6 +308,18 @@ enum NodeType : unsigned { // Operand 5: the width of the field in bits (8 or 16) ATOMIC_CMP_SWAPW, + // 128-bit atomic load. + // Val, OUTCHAIN = ATOMIC_LOAD_128(INCHAIN, ptr) + ATOMIC_LOAD_128, + + // 128-bit atomic store.
+ // OUTCHAIN = ATOMIC_STORE_128(INCHAIN, val, ptr) + ATOMIC_STORE_128, + + // 128-bit atomic compare-and-swap. + // Val, OUTCHAIN = ATOMIC_CMP_SWAP_128(INCHAIN, ptr, cmp, swap) + ATOMIC_CMP_SWAP_128, + // Byte swapping load. // // Operand 0: the address to load from @@ -449,6 +461,10 @@ public: EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; + void LowerOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const override; + void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results, + SelectionDAG &DAG) const override; bool allowTruncateForTailCall(Type *, Type *) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, @@ -566,6 +582,8 @@ private: MachineBasicBlock *emitCondStore(MachineInstr &MI, MachineBasicBlock *BB, unsigned StoreOpcode, unsigned STOCOpcode, bool Invert) const; + MachineBasicBlock *emitPair128(MachineInstr &MI, + MachineBasicBlock *MBB) const; MachineBasicBlock *emitExt128(MachineInstr &MI, MachineBasicBlock *MBB, bool ClearEven) const; MachineBasicBlock *emitAtomicLoadBinary(MachineInstr &MI, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index f64c0d15ef8..766d07e8d89 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -372,6 +372,9 @@ let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { def LTGR : UnaryRRE<"ltgr", 0xB902, null_frag, GR64, GR64>; } +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in + def PAIR128 : Pseudo<(outs GR128:$dst), (ins GR64:$hi, GR64:$lo), []>; + // Immediate moves. let hasSideEffects = 0, isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in { @@ -1721,7 +1724,7 @@ let Defs = [CC] in { // Compare double and swap.
let Defs = [CC] in { defm CDS : CmpSwapRSPair<"cds", 0xBB, 0xEB31, null_frag, GR128>; - def CDSG : CmpSwapRSY<"cdsg", 0xEB3E, null_frag, GR128>; + def CDSG : CmpSwapRSY<"cdsg", 0xEB3E, z_atomic_cmp_swap_128, GR128>; } // Compare and swap and store. @@ -1733,8 +1736,8 @@ let Uses = [R0L, R1D], Defs = [CC], mayStore = 1, mayLoad =1 in def PLO : SideEffectQuaternarySSe<"plo", 0xEE, GR64>; // Load/store pair from/to quadword. -def LPQ : UnaryRXY<"lpq", 0xE38F, null_frag, GR128, 16>; -def STPQ : StoreRXY<"stpq", 0xE38E, null_frag, GR128, 16>; +def LPQ : UnaryRXY<"lpq", 0xE38F, z_atomic_load_128, GR128, 16>; +def STPQ : StoreRXY<"stpq", 0xE38E, z_atomic_store_128, GR128, 16>; // Load pair disjoint. let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 759a8bb0ce1..570218254f8 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -55,6 +55,17 @@ def SDT_ZAtomicCmpSwapW : SDTypeProfile<1, 6, SDTCisVT<4, i32>, SDTCisVT<5, i32>, SDTCisVT<6, i32>]>; +def SDT_ZAtomicLoad128 : SDTypeProfile<1, 1, + [SDTCisVT<0, untyped>, + SDTCisPtrTy<1>]>; +def SDT_ZAtomicStore128 : SDTypeProfile<0, 2, + [SDTCisVT<0, untyped>, + SDTCisPtrTy<1>]>; +def SDT_ZAtomicCmpSwap128 : SDTypeProfile<1, 3, + [SDTCisVT<0, untyped>, + SDTCisPtrTy<1>, + SDTCisVT<2, untyped>, + SDTCisVT<3, untyped>]>; def SDT_ZMemMemLength : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, @@ -287,6 +298,17 @@ def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">; def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">; def z_atomic_cmp_swapw : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>; +def z_atomic_load_128 : SDNode<"SystemZISD::ATOMIC_LOAD_128", + SDT_ZAtomicLoad128, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def z_atomic_store_128 : SDNode<"SystemZISD::ATOMIC_STORE_128", + SDT_ZAtomicStore128, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; +def 
z_atomic_cmp_swap_128 : SDNode<"SystemZISD::ATOMIC_CMP_SWAP_128", + SDT_ZAtomicCmpSwap128, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; + def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength, [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>; def z_mvc_loop : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop, diff --git a/test/CodeGen/SystemZ/atomic-load-05.ll b/test/CodeGen/SystemZ/atomic-load-05.ll new file mode 100644 index 00000000000..c527184ff23 --- /dev/null +++ b/test/CodeGen/SystemZ/atomic-load-05.ll @@ -0,0 +1,13 @@ +; Test 128-bit atomic loads. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define i128 @f1(i128 *%src) { +; CHECK-LABEL: f1: +; CHECK: lpq %r0, 0(%r3) +; CHECK-DAG: stg %r1, 8(%r2) +; CHECK-DAG: stg %r0, 0(%r2) +; CHECK: br %r14 + %val = load atomic i128, i128 *%src seq_cst, align 16 + ret i128 %val +} diff --git a/test/CodeGen/SystemZ/atomic-store-05.ll b/test/CodeGen/SystemZ/atomic-store-05.ll new file mode 100644 index 00000000000..e0ea660852b --- /dev/null +++ b/test/CodeGen/SystemZ/atomic-store-05.ll @@ -0,0 +1,25 @@ +; Test 128-bit atomic stores. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +define void @f1(i128 %val, i128 *%src) { +; CHECK-LABEL: f1: +; CHECK-DAG: lg %r1, 8(%r2) +; CHECK-DAG: lg %r0, 0(%r2) +; CHECK: stpq %r0, 0(%r3) +; CHECK: bcr 1{{[45]}}, %r0 +; CHECK: br %r14 + store atomic i128 %val, i128 *%src seq_cst, align 16 + ret void +} + +define void @f2(i128 %val, i128 *%src) { +; CHECK-LABEL: f2: +; CHECK-DAG: lg %r1, 8(%r2) +; CHECK-DAG: lg %r0, 0(%r2) +; CHECK: stpq %r0, 0(%r3) +; CHECK-NOT: bcr 1{{[45]}}, %r0 +; CHECK: br %r14 + store atomic i128 %val, i128 *%src monotonic, align 16 + ret void +} diff --git a/test/CodeGen/SystemZ/cmpxchg-06.ll b/test/CodeGen/SystemZ/cmpxchg-06.ll new file mode 100644 index 00000000000..da565791c7c --- /dev/null +++ b/test/CodeGen/SystemZ/cmpxchg-06.ll @@ -0,0 +1,113 @@ +; Test 128-bit compare and swap.
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s + +; Check CDSG without a displacement. +define i128 @f1(i128 %cmp, i128 %swap, i128 *%src) { +; CHECK-LABEL: f1: +; CHECK-DAG: lg %r1, 8(%r4) +; CHECK-DAG: lg %r0, 0(%r4) +; CHECK-DAG: lg %r13, 8(%r3) +; CHECK-DAG: lg %r12, 0(%r3) +; CHECK: cdsg %r12, %r0, 0(%r5) +; CHECK-DAG: stg %r13, 8(%r2) +; CHECK-DAG: stg %r12, 0(%r2) +; CHECK: br %r14 + %pairval = cmpxchg i128 *%src, i128 %cmp, i128 %swap seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +} + +; Check the high end of the aligned CDSG range. +define i128 @f2(i128 %cmp, i128 %swap, i128 *%src) { +; CHECK-LABEL: f2: +; CHECK: cdsg {{%r[0-9]+}}, {{%r[0-9]+}}, 524272(%r5) +; CHECK: br %r14 + %ptr = getelementptr i128, i128 *%src, i128 32767 + %pairval = cmpxchg i128 *%ptr, i128 %cmp, i128 %swap seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +} + +; Check the next doubleword up, which needs separate address logic. +; Other sequences besides this one would be OK. +define i128 @f3(i128 %cmp, i128 %swap, i128 *%src) { +; CHECK-LABEL: f3: +; CHECK: agfi %r5, 524288 +; CHECK: cdsg {{%r[0-9]+}}, {{%r[0-9]+}}, 0(%r5) +; CHECK: br %r14 + %ptr = getelementptr i128, i128 *%src, i128 32768 + %pairval = cmpxchg i128 *%ptr, i128 %cmp, i128 %swap seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +} + +; Check the high end of the negative aligned CDSG range. +define i128 @f4(i128 %cmp, i128 %swap, i128 *%src) { +; CHECK-LABEL: f4: +; CHECK: cdsg {{%r[0-9]+}}, {{%r[0-9]+}}, -16(%r5) +; CHECK: br %r14 + %ptr = getelementptr i128, i128 *%src, i128 -1 + %pairval = cmpxchg i128 *%ptr, i128 %cmp, i128 %swap seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +} + +; Check the low end of the CDSG range. 
+define i128 @f5(i128 %cmp, i128 %swap, i128 *%src) { +; CHECK-LABEL: f5: +; CHECK: cdsg {{%r[0-9]+}}, {{%r[0-9]+}}, -524288(%r5) +; CHECK: br %r14 + %ptr = getelementptr i128, i128 *%src, i128 -32768 + %pairval = cmpxchg i128 *%ptr, i128 %cmp, i128 %swap seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +} + +; Check the next doubleword down, which needs separate address logic. +; Other sequences besides this one would be OK. +define i128 @f6(i128 %cmp, i128 %swap, i128 *%src) { +; CHECK-LABEL: f6: +; CHECK: agfi %r5, -524304 +; CHECK: cdsg {{%r[0-9]+}}, {{%r[0-9]+}}, 0(%r5) +; CHECK: br %r14 + %ptr = getelementptr i128, i128 *%src, i128 -32769 + %pairval = cmpxchg i128 *%ptr, i128 %cmp, i128 %swap seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +} + +; Check that CDSG does not allow an index. +define i128 @f7(i128 %cmp, i128 %swap, i64 %src, i64 %index) { +; CHECK-LABEL: f7: +; CHECK: agr %r5, %r6 +; CHECK: cdsg {{%r[0-9]+}}, {{%r[0-9]+}}, 0(%r5) +; CHECK: br %r14 + %add1 = add i64 %src, %index + %ptr = inttoptr i64 %add1 to i128 * + %pairval = cmpxchg i128 *%ptr, i128 %cmp, i128 %swap seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +} + +; Check that a constant %cmp value is loaded into a register first. +define i128 @f8(i128 %swap, i128 *%ptr) { +; CHECK-LABEL: f8: +; CHECK: lghi {{%r[0-9]+}}, 1001 +; CHECK: cdsg {{%r[0-9]+}}, {{%r[0-9]+}}, 0(%r4) +; CHECK: br %r14 + %pairval = cmpxchg i128 *%ptr, i128 1001, i128 %swap seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +} + +; Check that a constant %swap value is loaded into a register first. +define i128 @f9(i128 %cmp, i128 *%ptr) { +; CHECK-LABEL: f9: +; CHECK: lghi {{%r[0-9]+}}, 1002 +; CHECK: cdsg {{%r[0-9]+}}, {{%r[0-9]+}}, 0(%r4) +; CHECK: br %r14 + %pairval = cmpxchg i128 *%ptr, i128 %cmp, i128 1002 seq_cst seq_cst + %val = extractvalue { i128, i1 } %pairval, 0 + ret i128 %val +}