From 538287dea2e6ae17a0f81615097c4de0df5d1e4d Mon Sep 17 00:00:00 2001 From: Ulrich Weigand Date: Tue, 5 May 2015 19:29:21 +0000 Subject: [PATCH] [SystemZ] Handle sub-128 vectors The ABI allows sub-128 vectors to be passed and returned in registers, with the vector occupying the upper part of a register. We therefore want to legalize those types by widening the vector rather than promoting the elements. The patch includes some simple tests for sub-128 vectors and also tests that we can recognize various pack sequences, some of which use sub-128 vectors as temporary results. One of these forms is based on the pack sequences generated by llvmpipe when no intrinsics are used. Signed unpacks are recognized as BUILD_VECTORs whose elements are individually sign-extended. Unsigned unpacks can have the equivalent form with zero extension, but they also occur as shuffles in which some elements are zero. Based on a patch by Richard Sandiford. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@236525 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/SystemZ/SystemZCallingConv.h | 17 + lib/Target/SystemZ/SystemZCallingConv.td | 17 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 81 +++- lib/Target/SystemZ/SystemZISelLowering.h | 28 ++ lib/Target/SystemZ/SystemZInstrVector.td | 24 +- lib/Target/SystemZ/SystemZOperators.td | 15 +- test/CodeGen/SystemZ/vec-args-03.ll | 14 + test/CodeGen/SystemZ/vec-args-04.ll | 50 +++ test/CodeGen/SystemZ/vec-args-05.ll | 32 ++ test/CodeGen/SystemZ/vec-combine-01.ll | 48 +++ test/CodeGen/SystemZ/vec-combine-02.ll | 433 +++++++++++++++++++++ test/CodeGen/SystemZ/vec-const-01.ll | 48 +++ test/CodeGen/SystemZ/vec-const-02.ll | 32 ++ test/CodeGen/SystemZ/vec-const-03.ll | 16 + test/CodeGen/SystemZ/vec-const-05.ll | 16 + test/CodeGen/SystemZ/vec-move-01.ll | 56 +++ test/CodeGen/SystemZ/vec-move-13.ll | 4 +- test/CodeGen/SystemZ/vec-move-15.ll | 106 +++++ test/CodeGen/SystemZ/vec-move-16.ll | 106 +++++ test/CodeGen/SystemZ/vec-sub-01.ll | 61 +++ 
20 files changed, 1175 insertions(+), 29 deletions(-) create mode 100644 test/CodeGen/SystemZ/vec-args-04.ll create mode 100644 test/CodeGen/SystemZ/vec-args-05.ll create mode 100644 test/CodeGen/SystemZ/vec-combine-02.ll create mode 100644 test/CodeGen/SystemZ/vec-move-15.ll create mode 100644 test/CodeGen/SystemZ/vec-move-16.ll diff --git a/lib/Target/SystemZ/SystemZCallingConv.h b/lib/Target/SystemZ/SystemZCallingConv.h index 8b8146762b6..bff0706618a 100644 --- a/lib/Target/SystemZ/SystemZCallingConv.h +++ b/lib/Target/SystemZ/SystemZCallingConv.h @@ -28,6 +28,14 @@ private: /// See ISD::OutputArg::IsFixed. SmallVector ArgIsFixed; + /// Records whether the value was widened from a short vector type. + SmallVector ArgIsShortVector; + + // Check whether ArgVT is a short vector type. + bool IsShortVectorType(EVT ArgVT) { + return ArgVT.isVector() && ArgVT.getStoreSize() <= 8; + } + public: SystemZCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, SmallVectorImpl &locs, LLVMContext &C) @@ -39,6 +47,10 @@ public: ArgIsFixed.clear(); for (unsigned i = 0; i < Ins.size(); ++i) ArgIsFixed.push_back(true); + // Record whether the call operand was a short vector. + ArgIsShortVector.clear(); + for (unsigned i = 0; i < Ins.size(); ++i) + ArgIsShortVector.push_back(IsShortVectorType(Ins[i].ArgVT)); CCState::AnalyzeFormalArguments(Ins, Fn); } @@ -49,6 +61,10 @@ public: ArgIsFixed.clear(); for (unsigned i = 0; i < Outs.size(); ++i) ArgIsFixed.push_back(Outs[i].IsFixed); + // Record whether the call operand was a short vector. 
+ ArgIsShortVector.clear(); + for (unsigned i = 0; i < Outs.size(); ++i) + ArgIsShortVector.push_back(IsShortVectorType(Outs[i].ArgVT)); CCState::AnalyzeCallOperands(Outs, Fn); } @@ -60,6 +76,7 @@ public: CCAssignFn Fn) = delete; bool IsFixed(unsigned ValNo) { return ArgIsFixed[ValNo]; } + bool IsShortVector(unsigned ValNo) { return ArgIsShortVector[ValNo]; } }; } // end namespace llvm diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td index a2f996e60df..be8f00b57ad 100644 --- a/lib/Target/SystemZ/SystemZCallingConv.td +++ b/lib/Target/SystemZ/SystemZCallingConv.td @@ -21,6 +21,11 @@ class CCIfSubtarget class CCIfFixed : CCIf<"static_cast(&State)->IsFixed(ValNo)", A>; +// Match if this specific argument was widened from a short vector type. +class CCIfShortVector + : CCIf<"static_cast(&State)->IsShortVector(ValNo)", A>; + + //===----------------------------------------------------------------------===// // z/Linux return value calling convention //===----------------------------------------------------------------------===// @@ -43,6 +48,8 @@ def RetCC_SystemZ : CallingConv<[ CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, // Similarly for vectors, with V24 being the ABI-compliant choice. + // Sub-128 vectors are returned in the same way, but they're widened + // to one of these types during type legalization. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>> @@ -74,12 +81,20 @@ def CC_SystemZ : CallingConv<[ CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>, CCIfType<[f64], CCAssignToReg<[F0D, F2D, F4D, F6D]>>, - // The first 8 named vector arguments are passed in V24-V31. + // The first 8 named vector arguments are passed in V24-V31. Sub-128 vectors + // are passed in the same way, but they're widened to one of these types + // during type legalization. 
CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfFixed>>>, + // However, sub-128 vectors which need to go on the stack occupy just a + // single 8-byte-aligned 8-byte stack slot. Pass as i64. + CCIfSubtarget<"hasVector()", + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfShortVector>>>, + // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots. CCIfSubtarget<"hasVector()", CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index ff79a48179f..c3842519008 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -318,6 +318,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &tm, // Convert a GPR scalar to a vector by inserting it into element 0. setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); + // Use a series of unpacks for extensions. + setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); + setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); + // Detect shifts by a scalar amount and convert them into // V*_BY_SCALAR. setOperationAction(ISD::SHL, VT, Custom); @@ -793,7 +797,15 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL, else if (VA.getLocInfo() == CCValAssign::Indirect) Value = DAG.getLoad(VA.getValVT(), DL, Chain, Value, MachinePointerInfo(), false, false, false, 0); - else + else if (VA.getLocInfo() == CCValAssign::BCvt) { + // If this is a short vector argument loaded from the stack, + // extend from i64 to full vector size and then bitcast. 
+ assert(VA.getLocVT() == MVT::i64); + assert(VA.getValVT().isVector()); + Value = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i64, + Value, DAG.getUNDEF(MVT::i64)); + Value = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Value); + } else assert(VA.getLocInfo() == CCValAssign::Full && "Unsupported getLocInfo"); return Value; } @@ -810,6 +822,14 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL, return DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Value); case CCValAssign::AExt: return DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Value); + case CCValAssign::BCvt: + // If this is a short vector argument to be stored to the stack, + // bitcast to v2i64 and then extract first element. + assert(VA.getLocVT() == MVT::i64); + assert(VA.getValVT().isVector()); + Value = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VA.getLocVT(), Value, + DAG.getConstant(0, DL, MVT::i32)); case CCValAssign::Full: return Value; default: @@ -3910,6 +3930,23 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getNode(ISD::BITCAST, DL, VT, Res); } +SDValue +SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, + unsigned UnpackHigh) const { + SDValue PackedOp = Op.getOperand(0); + EVT OutVT = Op.getValueType(); + EVT InVT = PackedOp.getValueType(); + unsigned ToBits = OutVT.getVectorElementType().getSizeInBits(); + unsigned FromBits = InVT.getVectorElementType().getSizeInBits(); + do { + FromBits *= 2; + EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits), + SystemZ::VectorBits / FromBits); + PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp); + } while (FromBits != ToBits); + return PackedOp; +} + SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const { // Look for cases where a vector shift can use the *_BY_SCALAR form. 
@@ -4058,6 +4095,10 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op, return lowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::SIGN_EXTEND_VECTOR_INREG: + return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH); + case ISD::ZERO_EXTEND_VECTOR_INREG: + return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH); case ISD::SHL: return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR); case ISD::SRL: @@ -4122,6 +4163,10 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(PERMUTE_DWORDS); OPCODE(PERMUTE); OPCODE(PACK); + OPCODE(UNPACK_HIGH); + OPCODE(UNPACKL_HIGH); + OPCODE(UNPACK_LOW); + OPCODE(UNPACKL_LOW); OPCODE(VSHL_BY_SCALAR); OPCODE(VSRL_BY_SCALAR); OPCODE(VSRA_BY_SCALAR); @@ -4334,17 +4379,35 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N, } } } - // (z_merge_high 0, 0) -> 0. This is mostly useful for using VLLEZF - // for v4f32. - if (Opcode == SystemZISD::MERGE_HIGH) { + if (Opcode == SystemZISD::MERGE_HIGH || + Opcode == SystemZISD::MERGE_LOW) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); - if (Op0 == Op1) { - if (Op0.getOpcode() == ISD::BITCAST) - Op0 = Op0.getOperand(0); - if (Op0.getOpcode() == SystemZISD::BYTE_MASK && - cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) + if (Op0.getOpcode() == ISD::BITCAST) + Op0 = Op0.getOperand(0); + if (Op0.getOpcode() == SystemZISD::BYTE_MASK && + cast<ConstantSDNode>(Op0.getOperand(0))->getZExtValue() == 0) { + // (z_merge_* 0, 0) -> 0. This is mostly useful for using VLLEZF + // for v4f32. + if (Op1 == N->getOperand(0)) return Op1; + // (z_merge_? 0, X) -> (z_unpackl_? X). + EVT VT = Op1.getValueType(); + unsigned ElemBytes = VT.getVectorElementType().getStoreSize(); + if (ElemBytes <= 4) { + Opcode = (Opcode == SystemZISD::MERGE_HIGH ? 
+ SystemZISD::UNPACKL_HIGH : SystemZISD::UNPACKL_LOW); + EVT InVT = VT.changeVectorElementTypeToInteger(); + EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(ElemBytes * 16), + SystemZ::VectorBytes / ElemBytes / 2); + if (VT != InVT) { + Op1 = DAG.getNode(ISD::BITCAST, SDLoc(N), InVT, Op1); + DCI.AddToWorklist(Op1.getNode()); + } + SDValue Op = DAG.getNode(Opcode, SDLoc(N), OutVT, Op1); + DCI.AddToWorklist(Op.getNode()); + return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); + } } } // If we have (truncstoreiN (extract_vector_elt X, Y), Z) then it is better diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 24a3f4bb5d4..7a3b6fa85ae 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -201,6 +201,15 @@ enum { // Pack vector operands 0 and 1 into a single vector with half-sized elements. PACK, + // Unpack the first half of vector operand 0 into double-sized elements. + // UNPACK_HIGH sign-extends and UNPACKL_HIGH zero-extends. + UNPACK_HIGH, + UNPACKL_HIGH, + + // Likewise for the second half. + UNPACK_LOW, + UNPACKL_LOW, + // Shift each element of vector operand 0 by the number of bits specified // by scalar operand 1. VSHL_BY_SCALAR, @@ -306,6 +315,23 @@ public: // want to clobber the upper 32 bits of a GPR unnecessarily. return MVT::i32; } + TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) + const override { + // Widen subvectors to the full width rather than promoting integer + // elements. This is better because: + // + // (a) it means that we can handle the ABI for passing and returning + // sub-128 vectors without having to handle them as legal types. + // + // (b) we don't have instructions to extend on load and truncate on store, + // so promoting the integers is less efficient. + // + // (c) there are no multiplication instructions for the widest integer + // type (v2i64). 
+ if (VT.getVectorElementType().getSizeInBits() % 8 == 0) + return TypeWidenVector; + return TargetLoweringBase::getPreferredVectorAction(VT); + } EVT getSetCCResultType(LLVMContext &, EVT) const override; bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; @@ -417,6 +443,8 @@ private: SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG, + unsigned UnpackHigh) const; SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const; SDValue combineExtract(SDLoc DL, EVT ElemVT, EVT VecVT, SDValue OrigOp, diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 8abaeb69a20..f95714d1e70 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -290,24 +290,24 @@ let Predicates = [FeatureVector] in { def : Pat<(z_vsei32_by_parts (v4i32 VR128:$src)), (VSEGF VR128:$src)>; // Unpack high. - def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, null_frag, v128h, v128b, 0>; - def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, null_frag, v128f, v128h, 1>; - def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, null_frag, v128g, v128f, 2>; + def VUPHB : UnaryVRRa<"vuphb", 0xE7D7, z_unpack_high, v128h, v128b, 0>; + def VUPHH : UnaryVRRa<"vuphh", 0xE7D7, z_unpack_high, v128f, v128h, 1>; + def VUPHF : UnaryVRRa<"vuphf", 0xE7D7, z_unpack_high, v128g, v128f, 2>; // Unpack logical high. 
- def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, null_frag, v128h, v128b, 0>; - def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, null_frag, v128f, v128h, 1>; - def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, null_frag, v128g, v128f, 2>; + def VUPLHB : UnaryVRRa<"vuplhb", 0xE7D5, z_unpackl_high, v128h, v128b, 0>; + def VUPLHH : UnaryVRRa<"vuplhh", 0xE7D5, z_unpackl_high, v128f, v128h, 1>; + def VUPLHF : UnaryVRRa<"vuplhf", 0xE7D5, z_unpackl_high, v128g, v128f, 2>; // Unpack low. - def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, null_frag, v128h, v128b, 0>; - def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, null_frag, v128f, v128h, 1>; - def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, null_frag, v128g, v128f, 2>; + def VUPLB : UnaryVRRa<"vuplb", 0xE7D6, z_unpack_low, v128h, v128b, 0>; + def VUPLHW : UnaryVRRa<"vuplhw", 0xE7D6, z_unpack_low, v128f, v128h, 1>; + def VUPLF : UnaryVRRa<"vuplf", 0xE7D6, z_unpack_low, v128g, v128f, 2>; // Unpack logical low. - def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, null_frag, v128h, v128b, 0>; - def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, null_frag, v128f, v128h, 1>; - def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, null_frag, v128g, v128f, 2>; + def VUPLLB : UnaryVRRa<"vupllb", 0xE7D4, z_unpackl_low, v128h, v128b, 0>; + def VUPLLH : UnaryVRRa<"vupllh", 0xE7D4, z_unpackl_low, v128f, v128h, 1>; + def VUPLLF : UnaryVRRa<"vupllf", 0xE7D4, z_unpackl_low, v128g, v128f, 2>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 63c217413ac..9bf288aa68e 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -193,6 +193,10 @@ def z_permute_dwords : SDNode<"SystemZISD::PERMUTE_DWORDS", SDT_ZVecTernaryInt>; def z_permute : SDNode<"SystemZISD::PERMUTE", SDT_ZVecTernary>; def z_pack : SDNode<"SystemZISD::PACK", SDT_ZVecBinaryConv>; +def z_unpack_high : SDNode<"SystemZISD::UNPACK_HIGH", SDT_ZVecUnaryConv>; +def z_unpackl_high : 
SDNode<"SystemZISD::UNPACKL_HIGH", SDT_ZVecUnaryConv>; +def z_unpack_low : SDNode<"SystemZISD::UNPACK_LOW", SDT_ZVecUnaryConv>; +def z_unpackl_low : SDNode<"SystemZISD::UNPACKL_LOW", SDT_ZVecUnaryConv>; def z_vshl_by_scalar : SDNode<"SystemZISD::VSHL_BY_SCALAR", SDT_ZVecBinaryInt>; def z_vsrl_by_scalar : SDNode<"SystemZISD::VSRL_BY_SCALAR", @@ -544,11 +548,12 @@ def z_vllezi64 : PatFrag<(ops node:$addr), def z_vllezf32 : PatFrag<(ops node:$addr), (bitconvert (z_merge_high - (v2i64 (bitconvert - (z_merge_high - (v4f32 (z_vzero)), - (v4f32 (scalar_to_vector - (f32 (load node:$addr))))))), + (v2i64 + (z_unpackl_high + (v4i32 + (bitconvert + (v4f32 (scalar_to_vector + (f32 (load node:$addr)))))))), (v2i64 (z_vzero))))>; def z_vllezf64 : PatFrag<(ops node:$addr), (z_merge_high diff --git a/test/CodeGen/SystemZ/vec-args-03.ll b/test/CodeGen/SystemZ/vec-args-03.ll index e9f51c5e9ee..c47d8461021 100644 --- a/test/CodeGen/SystemZ/vec-args-03.ll +++ b/test/CodeGen/SystemZ/vec-args-03.ll @@ -14,3 +14,17 @@ define <4 x i32> @foo(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4 %y = sub <4 x i32> %v2, %v10 ret <4 x i32> %y } + +; This routine has 10 vector arguments, which fill up %v24-%v31 and +; the two single-wide stack slots at 160 and 168. +define <4 x i8> @bar(<4 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3, <4 x i8> %v4, + <4 x i8> %v5, <4 x i8> %v6, <4 x i8> %v7, <4 x i8> %v8, + <4 x i8> %v9, <4 x i8> %v10) { +; CHECK-LABEL: bar: +; CHECK: vlrepg [[REG1:%v[0-9]+]], 168(%r15) +; CHECK: vsb %v24, %v26, [[REG1]] +; CHECK: br %r14 + %y = sub <4 x i8> %v2, %v10 + ret <4 x i8> %y +} + diff --git a/test/CodeGen/SystemZ/vec-args-04.ll b/test/CodeGen/SystemZ/vec-args-04.ll new file mode 100644 index 00000000000..3a25404934e --- /dev/null +++ b/test/CodeGen/SystemZ/vec-args-04.ll @@ -0,0 +1,50 @@ +; Test the handling of named short vector arguments. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK + +; This routine has 12 vector arguments, which fill up %v24-%v31 +; and the four single-wide stack slots starting at 160. +declare void @bar(<1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>, + <1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>, + <1 x i8>, <2 x i8>, <4 x i8>, <8 x i8>) + +define void @foo() { +; CHECK-VEC-LABEL: foo: +; CHECK-VEC-DAG: vrepib %v24, 1 +; CHECK-VEC-DAG: vrepib %v26, 2 +; CHECK-VEC-DAG: vrepib %v28, 3 +; CHECK-VEC-DAG: vrepib %v30, 4 +; CHECK-VEC-DAG: vrepib %v25, 5 +; CHECK-VEC-DAG: vrepib %v27, 6 +; CHECK-VEC-DAG: vrepib %v29, 7 +; CHECK-VEC-DAG: vrepib %v31, 8 +; CHECK-VEC: brasl %r14, bar@PLT +; +; CHECK-STACK-LABEL: foo: +; CHECK-STACK: aghi %r15, -192 +; CHECK-STACK-DAG: llihh [[REG1:%r[0-9]+]], 2304 +; CHECK-STACK-DAG: stg [[REG1]], 160(%r15) +; CHECK-STACK-DAG: llihh [[REG2:%r[0-9]+]], 2570 +; CHECK-STACK-DAG: stg [[REG2]], 168(%r15) +; CHECK-STACK-DAG: llihf [[REG3:%r[0-9]+]], 185273099 +; CHECK-STACK-DAG: stg [[REG3]], 176(%r15) +; CHECK-STACK-DAG: llihf [[REG4:%r[0-9]+]], 202116108 +; CHECK-STACK-DAG: oilf [[REG4]], 202116108 +; CHECK-STACK-DAG: stg [[REG4]], 184(%r15) +; CHECK-STACK: brasl %r14, bar@PLT + + call void @bar (<1 x i8> <i8 1>, + <2 x i8> <i8 2, i8 2>, + <4 x i8> <i8 3, i8 3, i8 3, i8 3>, + <8 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>, + <1 x i8> <i8 5>, + <2 x i8> <i8 6, i8 6>, + <4 x i8> <i8 7, i8 7, i8 7, i8 7>, + <8 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, + <1 x i8> <i8 9>, + <2 x i8> <i8 10, i8 10>, + <4 x i8> <i8 11, i8 11, i8 11, i8 11>, + <8 x i8> <i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12, i8 12>) + ret void +} diff --git a/test/CodeGen/SystemZ/vec-args-05.ll b/test/CodeGen/SystemZ/vec-args-05.ll new file mode 100644 index 00000000000..cd1448b8611 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-args-05.ll @@ -0,0 +1,32 @@ +; Test the handling of unnamed short vector arguments. 
+; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-VEC +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s -check-prefix=CHECK-STACK + +; This routine is called with two named vector arguments (passed +; in %v24 and %v26) and two unnamed vector arguments (passed +; in the single-wide stack slots at 160 and 168). +declare void @bar(<4 x i8>, <4 x i8>, ...) + +define void @foo() { +; CHECK-VEC-LABEL: foo: +; CHECK-VEC-DAG: vrepib %v24, 1 +; CHECK-VEC-DAG: vrepib %v26, 2 +; CHECK-VEC: brasl %r14, bar@PLT +; +; CHECK-STACK-LABEL: foo: +; CHECK-STACK: aghi %r15, -176 +; CHECK-STACK-DAG: llihf [[REG1:%r[0-9]+]], 50529027 +; CHECK-STACK-DAG: stg [[REG1]], 160(%r15) +; CHECK-STACK-DAG: llihf [[REG2:%r[0-9]+]], 67372036 +; CHECK-STACK-DAG: stg [[REG2]], 168(%r15) +; CHECK-STACK: brasl %r14, bar@PLT + + call void (<4 x i8>, <4 x i8>, ...) @bar + (<4 x i8> <i8 1, i8 1, i8 1, i8 1>, + <4 x i8> <i8 2, i8 2, i8 2, i8 2>, + <4 x i8> <i8 3, i8 3, i8 3, i8 3>, + <4 x i8> <i8 4, i8 4, i8 4, i8 4>) + ret void +} + diff --git a/test/CodeGen/SystemZ/vec-combine-01.ll b/test/CodeGen/SystemZ/vec-combine-01.ll index f9da34b6475..a3593442172 100644 --- a/test/CodeGen/SystemZ/vec-combine-01.ll +++ b/test/CodeGen/SystemZ/vec-combine-01.ll @@ -105,3 +105,51 @@ define i16 @f5(<4 x i32> %v1, <4 x i32> %v2, <2 x i64> %v3) { %res = add i16 %elem1, %elem2 ret i16 %res } + +; Test a case where an unpack high can be eliminated from the usual +; load-extend sequence. 
+define void @f6(<8 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) { +; CHECK-LABEL: f6: +; CHECK: vlrepg [[REG:%v[0-9]+]], 0(%r2) +; CHECK-NOT: vup +; CHECK-DAG: vsteb [[REG]], 0(%r3), 1 +; CHECK-DAG: vsteb [[REG]], 0(%r4), 2 +; CHECK-DAG: vsteb [[REG]], 0(%r5), 7 +; CHECK: br %r14 + %vec = load <8 x i8>, <8 x i8> *%ptr1 + %ext = sext <8 x i8> %vec to <8 x i16> + %elem1 = extractelement <8 x i16> %ext, i32 1 + %elem2 = extractelement <8 x i16> %ext, i32 2 + %elem3 = extractelement <8 x i16> %ext, i32 7 + %trunc1 = trunc i16 %elem1 to i8 + %trunc2 = trunc i16 %elem2 to i8 + %trunc3 = trunc i16 %elem3 to i8 + store i8 %trunc1, i8 *%ptr2 + store i8 %trunc2, i8 *%ptr3 + store i8 %trunc3, i8 *%ptr4 + ret void +} + +; ...and again with a bitcast in between. +define void @f7(<4 x i8> *%ptr1, i8 *%ptr2, i8 *%ptr3, i8 *%ptr4) { +; CHECK-LABEL: f7: +; CHECK: vlrepf [[REG:%v[0-9]+]], 0(%r2) +; CHECK-NOT: vup +; CHECK-DAG: vsteb [[REG]], 0(%r3), 0 +; CHECK-DAG: vsteb [[REG]], 0(%r4), 1 +; CHECK-DAG: vsteb [[REG]], 0(%r5), 3 +; CHECK: br %r14 + %vec = load <4 x i8>, <4 x i8> *%ptr1 + %ext = sext <4 x i8> %vec to <4 x i32> + %bitcast = bitcast <4 x i32> %ext to <8 x i16> + %elem1 = extractelement <8 x i16> %bitcast, i32 1 + %elem2 = extractelement <8 x i16> %bitcast, i32 3 + %elem3 = extractelement <8 x i16> %bitcast, i32 7 + %trunc1 = trunc i16 %elem1 to i8 + %trunc2 = trunc i16 %elem2 to i8 + %trunc3 = trunc i16 %elem3 to i8 + store i8 %trunc1, i8 *%ptr2 + store i8 %trunc2, i8 *%ptr3 + store i8 %trunc3, i8 *%ptr4 + ret void +} diff --git a/test/CodeGen/SystemZ/vec-combine-02.ll b/test/CodeGen/SystemZ/vec-combine-02.ll new file mode 100644 index 00000000000..db0bf849017 --- /dev/null +++ b/test/CodeGen/SystemZ/vec-combine-02.ll @@ -0,0 +1,433 @@ +; Test various representations of pack-like operations. +; +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s + +; One way of writing a <4 x i32> -> <8 x i16> pack. 
+define <8 x i16> @f1(<4 x i32> %val0, <4 x i32> %val1) { +; CHECK-LABEL: f1: +; CHECK: vpkf %v24, %v24, %v26 +; CHECK: br %r14 + %elem0 = extractelement <4 x i32> %val0, i32 0 + %elem1 = extractelement <4 x i32> %val0, i32 1 + %elem2 = extractelement <4 x i32> %val0, i32 2 + %elem3 = extractelement <4 x i32> %val0, i32 3 + %elem4 = extractelement <4 x i32> %val1, i32 0 + %elem5 = extractelement <4 x i32> %val1, i32 1 + %elem6 = extractelement <4 x i32> %val1, i32 2 + %elem7 = extractelement <4 x i32> %val1, i32 3 + %hboth0 = bitcast i32 %elem0 to <2 x i16> + %hboth1 = bitcast i32 %elem1 to <2 x i16> + %hboth2 = bitcast i32 %elem2 to <2 x i16> + %hboth3 = bitcast i32 %elem3 to <2 x i16> + %hboth4 = bitcast i32 %elem4 to <2 x i16> + %hboth5 = bitcast i32 %elem5 to <2 x i16> + %hboth6 = bitcast i32 %elem6 to <2 x i16> + %hboth7 = bitcast i32 %elem7 to <2 x i16> + %hlow0 = shufflevector <2 x i16> %hboth0, <2 x i16> %hboth1, + <2 x i32> + %hlow1 = shufflevector <2 x i16> %hboth2, <2 x i16> %hboth3, + <2 x i32> + %hlow2 = shufflevector <2 x i16> %hboth4, <2 x i16> %hboth5, + <2 x i32> + %hlow3 = shufflevector <2 x i16> %hboth6, <2 x i16> %hboth7, + <2 x i32> + %join0 = shufflevector <2 x i16> %hlow0, <2 x i16> %hlow1, + <4 x i32> + %join1 = shufflevector <2 x i16> %hlow2, <2 x i16> %hlow3, + <4 x i32> + %ret = shufflevector <4 x i16> %join0, <4 x i16> %join1, + <8 x i32> + ret <8 x i16> %ret +} + +; A different way of writing a <4 x i32> -> <8 x i16> pack. 
+define <8 x i16> @f2(<4 x i32> %val0, <4 x i32> %val1) { +; CHECK-LABEL: f2: +; CHECK: vpkf %v24, %v24, %v26 +; CHECK: br %r14 + %elem0 = extractelement <4 x i32> %val0, i32 0 + %elem1 = extractelement <4 x i32> %val0, i32 1 + %elem2 = extractelement <4 x i32> %val0, i32 2 + %elem3 = extractelement <4 x i32> %val0, i32 3 + %elem4 = extractelement <4 x i32> %val1, i32 0 + %elem5 = extractelement <4 x i32> %val1, i32 1 + %elem6 = extractelement <4 x i32> %val1, i32 2 + %elem7 = extractelement <4 x i32> %val1, i32 3 + %wvec0 = insertelement <4 x i32> undef, i32 %elem0, i32 0 + %wvec1 = insertelement <4 x i32> undef, i32 %elem1, i32 0 + %wvec2 = insertelement <4 x i32> undef, i32 %elem2, i32 0 + %wvec3 = insertelement <4 x i32> undef, i32 %elem3, i32 0 + %wvec4 = insertelement <4 x i32> undef, i32 %elem4, i32 0 + %wvec5 = insertelement <4 x i32> undef, i32 %elem5, i32 0 + %wvec6 = insertelement <4 x i32> undef, i32 %elem6, i32 0 + %wvec7 = insertelement <4 x i32> undef, i32 %elem7, i32 0 + %hvec0 = bitcast <4 x i32> %wvec0 to <8 x i16> + %hvec1 = bitcast <4 x i32> %wvec1 to <8 x i16> + %hvec2 = bitcast <4 x i32> %wvec2 to <8 x i16> + %hvec3 = bitcast <4 x i32> %wvec3 to <8 x i16> + %hvec4 = bitcast <4 x i32> %wvec4 to <8 x i16> + %hvec5 = bitcast <4 x i32> %wvec5 to <8 x i16> + %hvec6 = bitcast <4 x i32> %wvec6 to <8 x i16> + %hvec7 = bitcast <4 x i32> %wvec7 to <8 x i16> + %hlow0 = shufflevector <8 x i16> %hvec0, <8 x i16> %hvec1, + <8 x i32> + %hlow1 = shufflevector <8 x i16> %hvec2, <8 x i16> %hvec3, + <8 x i32> + %hlow2 = shufflevector <8 x i16> %hvec4, <8 x i16> %hvec5, + <8 x i32> + %hlow3 = shufflevector <8 x i16> %hvec6, <8 x i16> %hvec7, + <8 x i32> + %join0 = shufflevector <8 x i16> %hlow0, <8 x i16> %hlow1, + <8 x i32> + %join1 = shufflevector <8 x i16> %hlow2, <8 x i16> %hlow3, + <8 x i32> + %ret = shufflevector <8 x i16> %join0, <8 x i16> %join1, + <8 x i32> + ret <8 x i16> %ret +} + +; A direct pack operation. 
+define <8 x i16> @f3(<4 x i32> %val0, <4 x i32> %val1) { +; CHECK-LABEL: f3: +; CHECK: vpkf %v24, %v24, %v26 +; CHECK: br %r14 + %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16> + %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16> + %ret = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1, + <8 x i32> + ret <8 x i16> %ret +} + +; One way of writing a <4 x i32> -> <16 x i8> pack. It doesn't matter +; whether the first pack is VPKF or VPKH since the even bytes of the +; result are discarded. +define <16 x i8> @f4(<4 x i32> %val0, <4 x i32> %val1, + <4 x i32> %val2, <4 x i32> %val3) { +; CHECK-LABEL: f4: +; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30 +; CHECK: vpkh %v24, [[REG1]], [[REG2]] +; CHECK: br %r14 + %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16> + %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16> + %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16> + %bitcast3 = bitcast <4 x i32> %val3 to <8 x i16> + %join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1, + <8 x i32> + %join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3, + <8 x i32> + %bitcast4 = bitcast <8 x i16> %join0 to <16 x i8> + %bitcast5 = bitcast <8 x i16> %join1 to <16 x i8> + %ret = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5, + <16 x i32> + ret <16 x i8> %ret +} + +; Check the same operation, but with elements being extracted from the result. 
+define void @f5(<4 x i32> %val0, <4 x i32> %val1, + <4 x i32> %val2, <4 x i32> %val3, + i8 *%base) { +; CHECK-LABEL: f5: +; CHECK-DAG: vsteb %v24, 0(%r2), 11 +; CHECK-DAG: vsteb %v26, 1(%r2), 15 +; CHECK-DAG: vsteb %v28, 2(%r2), 3 +; CHECK-DAG: vsteb %v30, 3(%r2), 7 +; CHECK: br %r14 + %bitcast0 = bitcast <4 x i32> %val0 to <8 x i16> + %bitcast1 = bitcast <4 x i32> %val1 to <8 x i16> + %bitcast2 = bitcast <4 x i32> %val2 to <8 x i16> + %bitcast3 = bitcast <4 x i32> %val3 to <8 x i16> + %join0 = shufflevector <8 x i16> %bitcast0, <8 x i16> %bitcast1, + <8 x i32> + %join1 = shufflevector <8 x i16> %bitcast2, <8 x i16> %bitcast3, + <8 x i32> + %bitcast4 = bitcast <8 x i16> %join0 to <16 x i8> + %bitcast5 = bitcast <8 x i16> %join1 to <16 x i8> + %vec = shufflevector <16 x i8> %bitcast4, <16 x i8> %bitcast5, + <16 x i32> + + %ptr0 = getelementptr i8, i8 *%base, i64 0 + %ptr1 = getelementptr i8, i8 *%base, i64 1 + %ptr2 = getelementptr i8, i8 *%base, i64 2 + %ptr3 = getelementptr i8, i8 *%base, i64 3 + + %byte0 = extractelement <16 x i8> %vec, i32 2 + %byte1 = extractelement <16 x i8> %vec, i32 7 + %byte2 = extractelement <16 x i8> %vec, i32 8 + %byte3 = extractelement <16 x i8> %vec, i32 13 + + store i8 %byte0, i8 *%ptr0 + store i8 %byte1, i8 *%ptr1 + store i8 %byte2, i8 *%ptr2 + store i8 %byte3, i8 *%ptr3 + + ret void +} + +; A different way of writing a <4 x i32> -> <16 x i8> pack. 
+define <16 x i8> @f6(<4 x i32> %val0, <4 x i32> %val1, + <4 x i32> %val2, <4 x i32> %val3) { +; CHECK-LABEL: f6: +; CHECK-DAG: vpk{{[hf]}} [[REG1:%v[0-9]+]], %v24, %v26 +; CHECK-DAG: vpk{{[hf]}} [[REG2:%v[0-9]+]], %v28, %v30 +; CHECK: vpkh %v24, [[REG1]], [[REG2]] +; CHECK: br %r14 + %elem0 = extractelement <4 x i32> %val0, i32 0 + %elem1 = extractelement <4 x i32> %val0, i32 1 + %elem2 = extractelement <4 x i32> %val0, i32 2 + %elem3 = extractelement <4 x i32> %val0, i32 3 + %elem4 = extractelement <4 x i32> %val1, i32 0 + %elem5 = extractelement <4 x i32> %val1, i32 1 + %elem6 = extractelement <4 x i32> %val1, i32 2 + %elem7 = extractelement <4 x i32> %val1, i32 3 + %elem8 = extractelement <4 x i32> %val2, i32 0 + %elem9 = extractelement <4 x i32> %val2, i32 1 + %elem10 = extractelement <4 x i32> %val2, i32 2 + %elem11 = extractelement <4 x i32> %val2, i32 3 + %elem12 = extractelement <4 x i32> %val3, i32 0 + %elem13 = extractelement <4 x i32> %val3, i32 1 + %elem14 = extractelement <4 x i32> %val3, i32 2 + %elem15 = extractelement <4 x i32> %val3, i32 3 + %bitcast0 = bitcast i32 %elem0 to <2 x i16> + %bitcast1 = bitcast i32 %elem1 to <2 x i16> + %bitcast2 = bitcast i32 %elem2 to <2 x i16> + %bitcast3 = bitcast i32 %elem3 to <2 x i16> + %bitcast4 = bitcast i32 %elem4 to <2 x i16> + %bitcast5 = bitcast i32 %elem5 to <2 x i16> + %bitcast6 = bitcast i32 %elem6 to <2 x i16> + %bitcast7 = bitcast i32 %elem7 to <2 x i16> + %bitcast8 = bitcast i32 %elem8 to <2 x i16> + %bitcast9 = bitcast i32 %elem9 to <2 x i16> + %bitcast10 = bitcast i32 %elem10 to <2 x i16> + %bitcast11 = bitcast i32 %elem11 to <2 x i16> + %bitcast12 = bitcast i32 %elem12 to <2 x i16> + %bitcast13 = bitcast i32 %elem13 to <2 x i16> + %bitcast14 = bitcast i32 %elem14 to <2 x i16> + %bitcast15 = bitcast i32 %elem15 to <2 x i16> + %low0 = shufflevector <2 x i16> %bitcast0, <2 x i16> %bitcast1, + <2 x i32> + %low1 = shufflevector <2 x i16> %bitcast2, <2 x i16> %bitcast3, + <2 x i32> + %low2 = 
shufflevector <2 x i16> %bitcast4, <2 x i16> %bitcast5, + <2 x i32> + %low3 = shufflevector <2 x i16> %bitcast6, <2 x i16> %bitcast7, + <2 x i32> + %low4 = shufflevector <2 x i16> %bitcast8, <2 x i16> %bitcast9, + <2 x i32> + %low5 = shufflevector <2 x i16> %bitcast10, <2 x i16> %bitcast11, + <2 x i32> + %low6 = shufflevector <2 x i16> %bitcast12, <2 x i16> %bitcast13, + <2 x i32> + %low7 = shufflevector <2 x i16> %bitcast14, <2 x i16> %bitcast15, + <2 x i32> + %bytes0 = bitcast <2 x i16> %low0 to <4 x i8> + %bytes1 = bitcast <2 x i16> %low1 to <4 x i8> + %bytes2 = bitcast <2 x i16> %low2 to <4 x i8> + %bytes3 = bitcast <2 x i16> %low3 to <4 x i8> + %bytes4 = bitcast <2 x i16> %low4 to <4 x i8> + %bytes5 = bitcast <2 x i16> %low5 to <4 x i8> + %bytes6 = bitcast <2 x i16> %low6 to <4 x i8> + %bytes7 = bitcast <2 x i16> %low7 to <4 x i8> + %blow0 = shufflevector <4 x i8> %bytes0, <4 x i8> %bytes1, + <4 x i32> + %blow1 = shufflevector <4 x i8> %bytes2, <4 x i8> %bytes3, + <4 x i32> + %blow2 = shufflevector <4 x i8> %bytes4, <4 x i8> %bytes5, + <4 x i32> + %blow3 = shufflevector <4 x i8> %bytes6, <4 x i8> %bytes7, + <4 x i32> + %join0 = shufflevector <4 x i8> %blow0, <4 x i8> %blow1, + <8 x i32> + %join1 = shufflevector <4 x i8> %blow2, <4 x i8> %blow3, + <8 x i32> + %ret = shufflevector <8 x i8> %join0, <8 x i8> %join1, + <16 x i32> + ret <16 x i8> %ret +} + +; One way of writing a <2 x i64> -> <16 x i8> pack. 
+define <16 x i8> @f7(<2 x i64> %val0, <2 x i64> %val1,
+                     <2 x i64> %val2, <2 x i64> %val3,
+                     <2 x i64> %val4, <2 x i64> %val5,
+                     <2 x i64> %val6, <2 x i64> %val7) {
+; CHECK-LABEL: f7:
+; CHECK-DAG: vpk{{[hfg]}} [[REG1:%v[0-9]+]], %v24, %v26
+; CHECK-DAG: vpk{{[hfg]}} [[REG2:%v[0-9]+]], %v28, %v30
+; CHECK-DAG: vpk{{[hfg]}} [[REG3:%v[0-9]+]], %v25, %v27
+; CHECK-DAG: vpk{{[hfg]}} [[REG4:%v[0-9]+]], %v29, %v31
+; CHECK-DAG: vpk{{[hf]}} [[REG5:%v[0-9]+]], [[REG1]], [[REG2]]
+; CHECK-DAG: vpk{{[hf]}} [[REG6:%v[0-9]+]], [[REG3]], [[REG4]]
+; CHECK: vpkh %v24, [[REG5]], [[REG6]]
+; CHECK: br %r14
+  %elem0 = extractelement <2 x i64> %val0, i32 0
+  %elem1 = extractelement <2 x i64> %val0, i32 1
+  %elem2 = extractelement <2 x i64> %val1, i32 0
+  %elem3 = extractelement <2 x i64> %val1, i32 1
+  %elem4 = extractelement <2 x i64> %val2, i32 0
+  %elem5 = extractelement <2 x i64> %val2, i32 1
+  %elem6 = extractelement <2 x i64> %val3, i32 0
+  %elem7 = extractelement <2 x i64> %val3, i32 1
+  %elem8 = extractelement <2 x i64> %val4, i32 0
+  %elem9 = extractelement <2 x i64> %val4, i32 1
+  %elem10 = extractelement <2 x i64> %val5, i32 0
+  %elem11 = extractelement <2 x i64> %val5, i32 1
+  %elem12 = extractelement <2 x i64> %val6, i32 0
+  %elem13 = extractelement <2 x i64> %val6, i32 1
+  %elem14 = extractelement <2 x i64> %val7, i32 0
+  %elem15 = extractelement <2 x i64> %val7, i32 1
+  %bitcast0 = bitcast i64 %elem0 to <2 x i32>
+  %bitcast1 = bitcast i64 %elem1 to <2 x i32>
+  %bitcast2 = bitcast i64 %elem2 to <2 x i32>
+  %bitcast3 = bitcast i64 %elem3 to <2 x i32>
+  %bitcast4 = bitcast i64 %elem4 to <2 x i32>
+  %bitcast5 = bitcast i64 %elem5 to <2 x i32>
+  %bitcast6 = bitcast i64 %elem6 to <2 x i32>
+  %bitcast7 = bitcast i64 %elem7 to <2 x i32>
+  %bitcast8 = bitcast i64 %elem8 to <2 x i32>
+  %bitcast9 = bitcast i64 %elem9 to <2 x i32>
+  %bitcast10 = bitcast i64 %elem10 to <2 x i32>
+  %bitcast11 = bitcast i64 %elem11 to <2 x i32>
+  %bitcast12 = bitcast i64 %elem12 to <2 x i32>
+  %bitcast13 = bitcast i64 %elem13 to <2 x i32>
+  %bitcast14 = bitcast i64 %elem14 to <2 x i32>
+  %bitcast15 = bitcast i64 %elem15 to <2 x i32>
+  %low0 = shufflevector <2 x i32> %bitcast0, <2 x i32> %bitcast1,
+                        <2 x i32> <i32 1, i32 3>
+  %low1 = shufflevector <2 x i32> %bitcast2, <2 x i32> %bitcast3,
+                        <2 x i32> <i32 1, i32 3>
+  %low2 = shufflevector <2 x i32> %bitcast4, <2 x i32> %bitcast5,
+                        <2 x i32> <i32 1, i32 3>
+  %low3 = shufflevector <2 x i32> %bitcast6, <2 x i32> %bitcast7,
+                        <2 x i32> <i32 1, i32 3>
+  %low4 = shufflevector <2 x i32> %bitcast8, <2 x i32> %bitcast9,
+                        <2 x i32> <i32 1, i32 3>
+  %low5 = shufflevector <2 x i32> %bitcast10, <2 x i32> %bitcast11,
+                        <2 x i32> <i32 1, i32 3>
+  %low6 = shufflevector <2 x i32> %bitcast12, <2 x i32> %bitcast13,
+                        <2 x i32> <i32 1, i32 3>
+  %low7 = shufflevector <2 x i32> %bitcast14, <2 x i32> %bitcast15,
+                        <2 x i32> <i32 1, i32 3>
+  %half0 = bitcast <2 x i32> %low0 to <4 x i16>
+  %half1 = bitcast <2 x i32> %low1 to <4 x i16>
+  %half2 = bitcast <2 x i32> %low2 to <4 x i16>
+  %half3 = bitcast <2 x i32> %low3 to <4 x i16>
+  %half4 = bitcast <2 x i32> %low4 to <4 x i16>
+  %half5 = bitcast <2 x i32> %low5 to <4 x i16>
+  %half6 = bitcast <2 x i32> %low6 to <4 x i16>
+  %half7 = bitcast <2 x i32> %low7 to <4 x i16>
+  %hlow0 = shufflevector <4 x i16> %half0, <4 x i16> %half1,
+                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %hlow1 = shufflevector <4 x i16> %half2, <4 x i16> %half3,
+                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %hlow2 = shufflevector <4 x i16> %half4, <4 x i16> %half5,
+                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %hlow3 = shufflevector <4 x i16> %half6, <4 x i16> %half7,
+                         <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %bytes0 = bitcast <4 x i16> %hlow0 to <8 x i8>
+  %bytes1 = bitcast <4 x i16> %hlow1 to <8 x i8>
+  %bytes2 = bitcast <4 x i16> %hlow2 to <8 x i8>
+  %bytes3 = bitcast <4 x i16> %hlow3 to <8 x i8>
+  %join0 = shufflevector <8 x i8> %bytes0, <8 x i8> %bytes1,
+                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
+                                    i32 9, i32 11, i32 13, i32 15>
+  %join1 = shufflevector <8 x i8> %bytes2, <8 x i8> %bytes3,
+                         <8 x i32> <i32 1, i32 3, i32 5, i32 7,
+                                    i32 9, i32 11, i32 13, i32 15>
+  %ret = shufflevector <8 x i8> %join0, <8 x i8> %join1,
+                       <16 x i32> <i32 0, i32 1, i32 2, i32 3,
+                                   i32 4, i32 5, i32 6, i32 7,
+                                   i32 8, i32 9, i32 10, i32 11,
+                                   i32 12, i32 13, i32 14, i32 15>
+  ret <16 x i8> %ret
+}
+
+; Test a <2 x i64> -> <4 x f32> pack in which only individual elements are
+; needed.
+define float @f8(i64 %scalar0, i64 %scalar1, i64 %scalar2, i64 %scalar3) {
+; CHECK-LABEL: f8:
+; CHECK-NOT: vperm
+; CHECK-NOT: vpk
+; CHECK-NOT: vmrh
+; CHECK: aebr {{%f[0-7]}},
+; CHECK: aebr {{%f[0-7]}},
+; CHECK: meebr %f0,
+; CHECK: br %r14
+  %vec0 = insertelement <2 x i64> undef, i64 %scalar0, i32 0
+  %vec1 = insertelement <2 x i64> undef, i64 %scalar1, i32 0
+  %vec2 = insertelement <2 x i64> undef, i64 %scalar2, i32 0
+  %vec3 = insertelement <2 x i64> undef, i64 %scalar3, i32 0
+  %join0 = shufflevector <2 x i64> %vec0, <2 x i64> %vec1,
+                         <2 x i32> <i32 0, i32 2>
+  %join1 = shufflevector <2 x i64> %vec2, <2 x i64> %vec3,
+                         <2 x i32> <i32 0, i32 2>
+  %bitcast0 = bitcast <2 x i64> %join0 to <4 x float>
+  %bitcast1 = bitcast <2 x i64> %join1 to <4 x float>
+  %pack = shufflevector <4 x float> %bitcast0, <4 x float> %bitcast1,
+                        <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %elt0 = extractelement <4 x float> %pack, i32 0
+  %elt1 = extractelement <4 x float> %pack, i32 1
+  %elt2 = extractelement <4 x float> %pack, i32 2
+  %elt3 = extractelement <4 x float> %pack, i32 3
+  %add0 = fadd float %elt0, %elt2
+  %add1 = fadd float %elt1, %elt3
+  %ret = fmul float %add0, %add1
+  ret float %ret
+}
+
+; Test a <2 x f64> -> <4 x i32> pack in which only individual elements are
+; needed.
+define i32 @f9(double %scalar0, double %scalar1, double %scalar2,
+               double %scalar3) {
+; CHECK-LABEL: f9:
+; CHECK-NOT: vperm
+; CHECK-NOT: vpk
+; CHECK-NOT: vmrh
+; CHECK: ar {{%r[0-5]}},
+; CHECK: ar {{%r[0-5]}},
+; CHECK: or %r2,
+; CHECK: br %r14
+  %vec0 = insertelement <2 x double> undef, double %scalar0, i32 0
+  %vec1 = insertelement <2 x double> undef, double %scalar1, i32 0
+  %vec2 = insertelement <2 x double> undef, double %scalar2, i32 0
+  %vec3 = insertelement <2 x double> undef, double %scalar3, i32 0
+  %join0 = shufflevector <2 x double> %vec0, <2 x double> %vec1,
+                         <2 x i32> <i32 0, i32 2>
+  %join1 = shufflevector <2 x double> %vec2, <2 x double> %vec3,
+                         <2 x i32> <i32 0, i32 2>
+  %bitcast0 = bitcast <2 x double> %join0 to <4 x i32>
+  %bitcast1 = bitcast <2 x double> %join1 to <4 x i32>
+  %pack = shufflevector <4 x i32> %bitcast0, <4 x i32> %bitcast1,
+                        <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %elt0 = extractelement <4 x i32> %pack, i32 0
+  %elt1 = extractelement <4 x i32> %pack, i32 1
+  %elt2 = extractelement <4 x i32> %pack, i32 2
+  %elt3 = extractelement <4 x i32> %pack, i32 3
+  %add0 = add i32 %elt0, %elt2
+  %add1 = add i32 %elt1, %elt3
+  %ret = or i32 %add0, %add1
+  ret i32 %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-const-01.ll b/test/CodeGen/SystemZ/vec-const-01.ll
index f173b92b015..4cdcbf7c2dc 100644
--- a/test/CodeGen/SystemZ/vec-const-01.ll
+++ b/test/CodeGen/SystemZ/vec-const-01.ll
@@ -53,3 +53,51 @@ define <16 x i8> @f5() {
                  i8 0, i8 -1, i8 -1, i8 -1,
                  i8 0, i8 -1, i8 0, i8 -1>
 }
+
+; Test an all-zeros v2i8 that gets promoted to v16i8.
+define <2 x i8> @f6() {
+; CHECK-LABEL: f6:
+; CHECK: vgbm %v24, 0
+; CHECK: br %r14
+  ret <2 x i8> zeroinitializer
+}
+
+; Test a mixed v2i8 that gets promoted to v16i8 (mask 0x8000).
+define <2 x i8> @f7() {
+; CHECK-LABEL: f7:
+; CHECK: vgbm %v24, 32768
+; CHECK: br %r14
+  ret <2 x i8> <i8 -1, i8 0>
+}
+
+; Test an all-zeros v4i8 that gets promoted to v16i8.
+define <4 x i8> @f8() {
+; CHECK-LABEL: f8:
+; CHECK: vgbm %v24, 0
+; CHECK: br %r14
+  ret <4 x i8> zeroinitializer
+}
+
+; Test a mixed v4i8 that gets promoted to v16i8 (mask 0x9000).
+define <4 x i8> @f9() {
+; CHECK-LABEL: f9:
+; CHECK: vgbm %v24, 36864
+; CHECK: br %r14
+  ret <4 x i8> <i8 -1, i8 0, i8 0, i8 -1>
+}
+
+; Test an all-zeros v8i8 that gets promoted to v16i8.
+define <8 x i8> @f10() {
+; CHECK-LABEL: f10:
+; CHECK: vgbm %v24, 0
+; CHECK: br %r14
+  ret <8 x i8> zeroinitializer
+}
+
+; Test a mixed v8i8 that gets promoted to v16i8 (mask 0xE500).
+define <8 x i8> @f11() {
+; CHECK-LABEL: f11:
+; CHECK: vgbm %v24, 58624
+; CHECK: br %r14
+  ret <8 x i8> <i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 -1, i8 0, i8 -1>
+}
diff --git a/test/CodeGen/SystemZ/vec-const-02.ll b/test/CodeGen/SystemZ/vec-const-02.ll
index 541cbb9faca..73a89d4a841 100644
--- a/test/CodeGen/SystemZ/vec-const-02.ll
+++ b/test/CodeGen/SystemZ/vec-const-02.ll
@@ -45,3 +45,35 @@ define <8 x i16> @f5() {
   ret <8 x i16>
 }
+
+; Test an all-zeros v2i16 that gets promoted to v8i16.
+define <2 x i16> @f6() {
+; CHECK-LABEL: f6:
+; CHECK: vgbm %v24, 0
+; CHECK: br %r14
+  ret <2 x i16> zeroinitializer
+}
+
+; Test a mixed v2i16 that gets promoted to v8i16 (mask 0xc000).
+define <2 x i16> @f7() {
+; CHECK-LABEL: f7:
+; CHECK: vgbm %v24, 49152
+; CHECK: br %r14
+  ret <2 x i16> <i16 65535, i16 0>
+}
+
+; Test an all-zeros v4i16 that gets promoted to v8i16.
+define <4 x i16> @f8() {
+; CHECK-LABEL: f8:
+; CHECK: vgbm %v24, 0
+; CHECK: br %r14
+  ret <4 x i16> zeroinitializer
+}
+
+; Test a mixed v4i16 that gets promoted to v8i16 (mask 0x7200).
+define <4 x i16> @f9() {
+; CHECK-LABEL: f9:
+; CHECK: vgbm %v24, 29184
+; CHECK: br %r14
+  ret <4 x i16> <i16 255, i16 65535, i16 0, i16 65280>
+}
diff --git a/test/CodeGen/SystemZ/vec-const-03.ll b/test/CodeGen/SystemZ/vec-const-03.ll
index 45ed83866d5..adc1105229e 100644
--- a/test/CodeGen/SystemZ/vec-const-03.ll
+++ b/test/CodeGen/SystemZ/vec-const-03.ll
@@ -41,3 +41,19 @@ define <4 x i32> @f5() {
 ; CHECK: br %r14
   ret <4 x i32>
 }
+
+; Test an all-zeros v2i32 that gets promoted to v4i32.
+define <2 x i32> @f6() {
+; CHECK-LABEL: f6:
+; CHECK: vgbm %v24, 0
+; CHECK: br %r14
+  ret <2 x i32> zeroinitializer
+}
+
+; Test a mixed v2i32 that gets promoted to v4i32 (mask 0xae00).
+define <2 x i32> @f7() {
+; CHECK-LABEL: f7:
+; CHECK: vgbm %v24, 44544
+; CHECK: br %r14
+  ret <2 x i32> <i32 4278255360, i32 4294967040>
+}
diff --git a/test/CodeGen/SystemZ/vec-const-05.ll b/test/CodeGen/SystemZ/vec-const-05.ll
index c4828335c4b..55f3cdd5902 100644
--- a/test/CodeGen/SystemZ/vec-const-05.ll
+++ b/test/CodeGen/SystemZ/vec-const-05.ll
@@ -45,3 +45,19 @@ define <4 x float> @f5() {
   ret <4 x float>
 }
+
+; Test an all-zeros v2f32 that gets promoted to v4f32.
+define <2 x float> @f6() {
+; CHECK-LABEL: f6:
+; CHECK: vgbm %v24, 0
+; CHECK: br %r14
+  ret <2 x float> zeroinitializer
+}
+
+; Test a mixed v2f32 that gets promoted to v4f32 (mask 0xc700).
+define <2 x float> @f7() {
+; CHECK-LABEL: f7:
+; CHECK: vgbm %v24, 50944
+; CHECK: br %r14
+  ret <2 x float> <float 0xffffe00000000000, float 0x381fffffe0000000>
+}
diff --git a/test/CodeGen/SystemZ/vec-move-01.ll b/test/CodeGen/SystemZ/vec-move-01.ll
index 896d24a1d20..3ef98b7eda0 100644
--- a/test/CodeGen/SystemZ/vec-move-01.ll
+++ b/test/CodeGen/SystemZ/vec-move-01.ll
@@ -49,3 +49,59 @@ define <2 x double> @f6(<2 x double> %val1, <2 x double> %val2) {
 ; CHECK: br %r14
   ret <2 x double> %val2
 }
+
+; Test v2i8 moves.
+define <2 x i8> @f7(<2 x i8> %val1, <2 x i8> %val2) {
+; CHECK-LABEL: f7:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <2 x i8> %val2
+}
+
+; Test v4i8 moves.
+define <4 x i8> @f8(<4 x i8> %val1, <4 x i8> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <4 x i8> %val2
+}
+
+; Test v8i8 moves.
+define <8 x i8> @f9(<8 x i8> %val1, <8 x i8> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <8 x i8> %val2
+}
+
+; Test v2i16 moves.
+define <2 x i16> @f10(<2 x i16> %val1, <2 x i16> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <2 x i16> %val2
+}
+
+; Test v4i16 moves.
+define <4 x i16> @f11(<4 x i16> %val1, <4 x i16> %val2) {
+; CHECK-LABEL: f11:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <4 x i16> %val2
+}
+
+; Test v2i32 moves.
+define <2 x i32> @f12(<2 x i32> %val1, <2 x i32> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <2 x i32> %val2
+}
+
+; Test v2f32 moves.
+define <2 x float> @f13(<2 x float> %val1, <2 x float> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vlr %v24, %v26
+; CHECK: br %r14
+  ret <2 x float> %val2
+}
diff --git a/test/CodeGen/SystemZ/vec-move-13.ll b/test/CodeGen/SystemZ/vec-move-13.ll
index 4ad8e3f5210..165c3498702 100644
--- a/test/CodeGen/SystemZ/vec-move-13.ll
+++ b/test/CodeGen/SystemZ/vec-move-13.ll
@@ -49,8 +49,8 @@ define <2 x i64> @f4(i64 %val) {
 ; Test v4f32 insertion into 0.
 define <4 x float> @f5(float %val) {
 ; CHECK-LABEL: f5:
-; CHECK: vgbm [[ZERO:%v[0-9]+]], 0
-; CHECK: vmrhf [[REG:%v[0-9]+]], [[ZERO]], %v0
+; CHECK-DAG: vuplhf [[REG:%v[0-9]+]], %v0
+; CHECK-DAG: vgbm [[ZERO:%v[0-9]+]], 0
 ; CHECK: vmrhg %v24, [[ZERO]], [[REG]]
 ; CHECK: br %r14
   %ret = insertelement <4 x float> zeroinitializer, float %val, i32 3
diff --git a/test/CodeGen/SystemZ/vec-move-15.ll b/test/CodeGen/SystemZ/vec-move-15.ll
new file mode 100644
index 00000000000..bf375e111cc
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-move-15.ll
@@ -0,0 +1,106 @@
+; Test vector sign-extending loads.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i1->v16i8 extension.
+define <16 x i8> @f1(<16 x i1> *%ptr) {
+; No expected output, but must compile.
+  %val = load <16 x i1>, <16 x i1> *%ptr
+  %ret = sext <16 x i1> %val to <16 x i8>
+  ret <16 x i8> %ret
+}
+
+; Test a v8i1->v8i16 extension.
+define <8 x i16> @f2(<8 x i1> *%ptr) {
+; No expected output, but must compile.
+  %val = load <8 x i1>, <8 x i1> *%ptr
+  %ret = sext <8 x i1> %val to <8 x i16>
+  ret <8 x i16> %ret
+}
+
+; Test a v8i8->v8i16 extension.
+define <8 x i16> @f3(<8 x i8> *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuphb %v24, [[REG1]]
+; CHECK: br %r14
+  %val = load <8 x i8>, <8 x i8> *%ptr
+  %ret = sext <8 x i8> %val to <8 x i16>
+  ret <8 x i16> %ret
+}
+
+; Test a v4i1->v4i32 extension.
+define <4 x i32> @f4(<4 x i1> *%ptr) {
+; No expected output, but must compile.
+  %val = load <4 x i1>, <4 x i1> *%ptr
+  %ret = sext <4 x i1> %val to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i8->v4i32 extension.
+define <4 x i32> @f5(<4 x i8> *%ptr) {
+; CHECK-LABEL: f5:
+; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuphb [[REG2:%v[0-9]+]], [[REG1]]
+; CHECK: vuphh %v24, [[REG2]]
+; CHECK: br %r14
+  %val = load <4 x i8>, <4 x i8> *%ptr
+  %ret = sext <4 x i8> %val to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i16->v4i32 extension.
+define <4 x i32> @f6(<4 x i16> *%ptr) {
+; CHECK-LABEL: f6:
+; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuphh %v24, [[REG1]]
+; CHECK: br %r14
+  %val = load <4 x i16>, <4 x i16> *%ptr
+  %ret = sext <4 x i16> %val to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v2i1->v2i64 extension.
+define <2 x i64> @f7(<2 x i1> *%ptr) {
+; No expected output, but must compile.
+  %val = load <2 x i1>, <2 x i1> *%ptr
+  %ret = sext <2 x i1> %val to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i8->v2i64 extension.
+define <2 x i64> @f8(<2 x i8> *%ptr) {
+; CHECK-LABEL: f8:
+; CHECK: vlrepb [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vleb [[REG1]], 1(%r2), 1
+; CHECK: vuphb [[REG2:%v[0-9]+]], [[REG1]]
+; CHECK: vuphh [[REG3:%v[0-9]+]], [[REG2]]
+; CHECK: vuphf %v24, [[REG3]]
+; CHECK: br %r14
+  %val = load <2 x i8>, <2 x i8> *%ptr
+  %ret = sext <2 x i8> %val to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i16->v2i64 extension.
+define <2 x i64> @f9(<2 x i16> *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuphh [[REG2:%v[0-9]+]], [[REG1]]
+; CHECK: vuphf %v24, [[REG2]]
+; CHECK: br %r14
+  %val = load <2 x i16>, <2 x i16> *%ptr
+  %ret = sext <2 x i16> %val to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i32->v2i64 extension.
+define <2 x i64> @f10(<2 x i32> *%ptr) {
+; CHECK-LABEL: f10:
+; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuphf %v24, [[REG1]]
+; CHECK: br %r14
+  %val = load <2 x i32>, <2 x i32> *%ptr
+  %ret = sext <2 x i32> %val to <2 x i64>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-move-16.ll b/test/CodeGen/SystemZ/vec-move-16.ll
new file mode 100644
index 00000000000..152b0d4d88c
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-move-16.ll
@@ -0,0 +1,106 @@
+; Test vector zero-extending loads.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+; Test a v16i1->v16i8 extension.
+define <16 x i8> @f1(<16 x i1> *%ptr) {
+; No expected output, but must compile.
+  %val = load <16 x i1>, <16 x i1> *%ptr
+  %ret = zext <16 x i1> %val to <16 x i8>
+  ret <16 x i8> %ret
+}
+
+; Test a v8i1->v8i16 extension.
+define <8 x i16> @f2(<8 x i1> *%ptr) {
+; No expected output, but must compile.
+  %val = load <8 x i1>, <8 x i1> *%ptr
+  %ret = zext <8 x i1> %val to <8 x i16>
+  ret <8 x i16> %ret
+}
+
+; Test a v8i8->v8i16 extension.
+define <8 x i16> @f3(<8 x i8> *%ptr) {
+; CHECK-LABEL: f3:
+; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuplhb %v24, [[REG1]]
+; CHECK: br %r14
+  %val = load <8 x i8>, <8 x i8> *%ptr
+  %ret = zext <8 x i8> %val to <8 x i16>
+  ret <8 x i16> %ret
+}
+
+; Test a v4i1->v4i32 extension.
+define <4 x i32> @f4(<4 x i1> *%ptr) {
+; No expected output, but must compile.
+  %val = load <4 x i1>, <4 x i1> *%ptr
+  %ret = zext <4 x i1> %val to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i8->v4i32 extension.
+define <4 x i32> @f5(<4 x i8> *%ptr) {
+; CHECK-LABEL: f5:
+; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
+; CHECK: vuplhh %v24, [[REG2]]
+; CHECK: br %r14
+  %val = load <4 x i8>, <4 x i8> *%ptr
+  %ret = zext <4 x i8> %val to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v4i16->v4i32 extension.
+define <4 x i32> @f6(<4 x i16> *%ptr) {
+; CHECK-LABEL: f6:
+; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuplhh %v24, [[REG1]]
+; CHECK: br %r14
+  %val = load <4 x i16>, <4 x i16> *%ptr
+  %ret = zext <4 x i16> %val to <4 x i32>
+  ret <4 x i32> %ret
+}
+
+; Test a v2i1->v2i64 extension.
+define <2 x i64> @f7(<2 x i1> *%ptr) {
+; No expected output, but must compile.
+  %val = load <2 x i1>, <2 x i1> *%ptr
+  %ret = zext <2 x i1> %val to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i8->v2i64 extension.
+define <2 x i64> @f8(<2 x i8> *%ptr) {
+; CHECK-LABEL: f8:
+; CHECK: vlrepb [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vleb [[REG1]], 1(%r2), 1
+; CHECK: vuplhb [[REG2:%v[0-9]+]], [[REG1]]
+; CHECK: vuplhh [[REG3:%v[0-9]+]], [[REG2]]
+; CHECK: vuplhf %v24, [[REG3]]
+; CHECK: br %r14
+  %val = load <2 x i8>, <2 x i8> *%ptr
+  %ret = zext <2 x i8> %val to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i16->v2i64 extension.
+define <2 x i64> @f9(<2 x i16> *%ptr) {
+; CHECK-LABEL: f9:
+; CHECK: vlrepf [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuplhh [[REG2:%v[0-9]+]], [[REG1]]
+; CHECK: vuplhf %v24, [[REG2]]
+; CHECK: br %r14
+  %val = load <2 x i16>, <2 x i16> *%ptr
+  %ret = zext <2 x i16> %val to <2 x i64>
+  ret <2 x i64> %ret
+}
+
+; Test a v2i32->v2i64 extension.
+define <2 x i64> @f10(<2 x i32> *%ptr) {
+; CHECK-LABEL: f10:
+; CHECK: vlrepg [[REG1:%v[0-9]+]], 0(%r2)
+; CHECK: vuplhf %v24, [[REG1]]
+; CHECK: br %r14
+  %val = load <2 x i32>, <2 x i32> *%ptr
+  %ret = zext <2 x i32> %val to <2 x i64>
+  ret <2 x i64> %ret
+}
diff --git a/test/CodeGen/SystemZ/vec-sub-01.ll b/test/CodeGen/SystemZ/vec-sub-01.ll
index 5620ebcb8c4..4afad8bef65 100644
--- a/test/CodeGen/SystemZ/vec-sub-01.ll
+++ b/test/CodeGen/SystemZ/vec-sub-01.ll
@@ -85,3 +85,64 @@ define double @f7(<2 x double> %val1, <2 x double> %val2) {
   %ret = fsub double %scalar1, %scalar2
   ret double %ret
 }
+
+; Test a v2i8 subtraction, which gets promoted to v16i8.
+define <2 x i8> @f8(<2 x i8> %dummy, <2 x i8> %val1, <2 x i8> %val2) {
+; CHECK-LABEL: f8:
+; CHECK: vsb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <2 x i8> %val1, %val2
+  ret <2 x i8> %ret
+}
+
+; Test a v4i8 subtraction, which gets promoted to v16i8.
+define <4 x i8> @f9(<4 x i8> %dummy, <4 x i8> %val1, <4 x i8> %val2) {
+; CHECK-LABEL: f9:
+; CHECK: vsb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <4 x i8> %val1, %val2
+  ret <4 x i8> %ret
+}
+
+; Test a v8i8 subtraction, which gets promoted to v16i8.
+define <8 x i8> @f10(<8 x i8> %dummy, <8 x i8> %val1, <8 x i8> %val2) {
+; CHECK-LABEL: f10:
+; CHECK: vsb %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <8 x i8> %val1, %val2
+  ret <8 x i8> %ret
+}
+
+; Test a v2i16 subtraction, which gets promoted to v8i16.
+define <2 x i16> @f11(<2 x i16> %dummy, <2 x i16> %val1, <2 x i16> %val2) {
+; CHECK-LABEL: f11:
+; CHECK: vsh %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <2 x i16> %val1, %val2
+  ret <2 x i16> %ret
+}
+
+; Test a v4i16 subtraction, which gets promoted to v8i16.
+define <4 x i16> @f12(<4 x i16> %dummy, <4 x i16> %val1, <4 x i16> %val2) {
+; CHECK-LABEL: f12:
+; CHECK: vsh %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <4 x i16> %val1, %val2
+  ret <4 x i16> %ret
+}
+
+; Test a v2i32 subtraction, which gets promoted to v4i32.
+define <2 x i32> @f13(<2 x i32> %dummy, <2 x i32> %val1, <2 x i32> %val2) {
+; CHECK-LABEL: f13:
+; CHECK: vsf %v24, %v26, %v28
+; CHECK: br %r14
+  %ret = sub <2 x i32> %val1, %val2
+  ret <2 x i32> %ret
+}
+
+; Test a v2f32 subtraction, which gets promoted to v4f32.
+define <2 x float> @f14(<2 x float> %val1, <2 x float> %val2) {
+; No particular output expected, but must compile.
+  %ret = fsub <2 x float> %val1, %val2
+  ret <2 x float> %ret
+}
-- 
2.40.0