bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
void transferMemOperands(SDNode *N, SDNode *Result);
- MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr);
};
} // end anonymous namespace
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp});
}
-/// This method returns a node after flipping the MSB of each element
-/// of vector integer type. Additionally, if SignBitVec is non-null,
-/// this method sets a node with one at MSB of all elements
-/// and zero at other bits in SignBitVec.
-MachineSDNode *
-PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) {
- SDLoc dl(N);
- EVT VecVT = N.getValueType();
- if (VecVT == MVT::v4i32) {
- if (SignBitVec) {
- SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32);
- *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT,
- SDValue(ZV, 0));
- }
- return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N);
- }
- else if (VecVT == MVT::v8i16) {
- SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32,
- getI32Imm(0x8000, dl));
- SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
- SDValue(Hi, 0),
- getI32Imm(0x8000, dl));
- SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT,
- SDValue(ScaImm, 0));
- /*
- Alternatively, we can do this as follow to use VRF instead of GPR.
- vspltish 5, 1
- vspltish 6, 15
- vslh 5, 6, 5
- */
- if (SignBitVec) *SignBitVec = VecImm;
- return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N,
- SDValue(VecImm, 0));
- }
- else if (VecVT == MVT::v16i8) {
- SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32,
- getI32Imm(0x80, dl));
- if (SignBitVec) *SignBitVec = VecImm;
- return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N,
- SDValue(VecImm, 0));
- }
- else
- llvm_unreachable("Unsupported vector data type for flipSignBit");
-}
-
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
}
- case ISD::ABS: {
- assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector");
-
- // For vector absolute difference, we use VABSDUW instruction of POWER9.
- // Since VABSDU instructions are for unsigned integers, we need adjustment
- // for signed integers.
- // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000).
- // Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1.
- // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
- EVT VecVT = N->getOperand(0).getValueType();
- SDNode *AbsOp = nullptr;
- unsigned AbsOpcode;
-
- if (VecVT == MVT::v4i32)
- AbsOpcode = PPC::VABSDUW;
- else if (VecVT == MVT::v8i16)
- AbsOpcode = PPC::VABSDUH;
- else if (VecVT == MVT::v16i8)
- AbsOpcode = PPC::VABSDUB;
- else
- llvm_unreachable("Unsupported vector data type for ISD::ABS");
-
- // Even for signed integers, we can skip adjustment if all values are
- // known to be positive (as signed integer) due to zero-extended inputs.
- if (N->getOperand(0).getOpcode() == ISD::SUB &&
- N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
- N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) {
- AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
- SDValue(N->getOperand(0)->getOperand(0)),
- SDValue(N->getOperand(0)->getOperand(1)));
- ReplaceNode(N, AbsOp);
- return;
- }
- if (N->getOperand(0).getOpcode() == ISD::SUB) {
- SDValue SubVal = N->getOperand(0);
- SDNode *Op0 = flipSignBit(SubVal->getOperand(0));
- SDNode *Op1 = flipSignBit(SubVal->getOperand(1));
- AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
- SDValue(Op0, 0), SDValue(Op1, 0));
- }
- else {
- SDNode *Op1 = nullptr;
- SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1);
- AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0),
- SDValue(Op1, 0));
- }
- ReplaceNode(N, AbsOp);
- return;
- }
}
SelectCode(N);
setOperationAction(ISD::UREM, MVT::i64, Expand);
}
- if (Subtarget.hasP9Vector()) {
- setOperationAction(ISD::ABS, MVT::v4i32, Legal);
- setOperationAction(ISD::ABS, MVT::v8i16, Legal);
- setOperationAction(ISD::ABS, MVT::v16i8, Legal);
- }
-
// Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
// add/sub are legal for all supported vector VT's.
setOperationAction(ISD::ADD, VT, Legal);
setOperationAction(ISD::SUB, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Custom);
// Vector instructions introduced in P8
if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+ // Without hasP8Altivec set, v2i64 SMAX isn't available.
+ // But ABS custom lowering requires SMAX support.
+ if (!Subtarget.hasP8Altivec())
+ setOperationAction(ISD::ABS, MVT::v2i64, Expand);
+
addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
setTargetDAGCombine(ISD::FSQRT);
}
+ if (Subtarget.hasP9Altivec()) {
+ setTargetDAGCombine(ISD::ABS);
+ }
+
// Darwin long double math library functions have $LDBL128 appended.
if (Subtarget.isDarwin()) {
setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
case PPCISD::RFEBB: return "PPCISD::RFEBB";
case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
+ case PPCISD::VABSD: return "PPCISD::VABSD";
case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
return DAG.getRegister(PPC::R2, MVT::i32);
}
- // We are looking for absolute values here.
- // The idea is to try to fit one of two patterns:
- // max (a, (0-a)) OR max ((0-a), a)
- if (Subtarget.hasP9Vector() &&
- (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw ||
- IntrinsicID == Intrinsic::ppc_altivec_vmaxsh ||
- IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) {
- SDValue V1 = Op.getOperand(1);
- SDValue V2 = Op.getOperand(2);
- if (V1.getSimpleValueType() == V2.getSimpleValueType() &&
- (V1.getSimpleValueType() == MVT::v4i32 ||
- V1.getSimpleValueType() == MVT::v8i16 ||
- V1.getSimpleValueType() == MVT::v16i8)) {
- if ( V1.getOpcode() == ISD::SUB &&
- ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
- V1.getOperand(1) == V2 ) {
- // Generate the abs instruction with the operands
- return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2);
- }
-
- if ( V2.getOpcode() == ISD::SUB &&
- ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
- V2.getOperand(1) == V1 ) {
- // Generate the abs instruction with the operands
- return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1);
- }
- }
- }
-
// If this is a lowered altivec predicate compare, CompareOpc is set to the
// opcode number of the comparison.
int CompareOpc;
}
}
+// Custom-lower ISD::ABS for AltiVec vector types (v2i64/v4i32/v8i16/v16i8)
+// as smax(x, 0 - x), emitted via the vmaxs* AltiVec intrinsics.
+SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
+
+  assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");
+
+  EVT VT = Op.getValueType();
+  assert(VT.isVector() &&
+         "Only set vector abs as custom, scalar abs shouldn't reach here!");
+  assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+          VT == MVT::v16i8) &&
+         "Unexpected vector element type!");
+  // v2i64 smax (vmaxsd) requires P8 Altivec; ISD::ABS is marked Expand for
+  // v2i64 on older subtargets, so it must never reach this custom hook.
+  assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
+         "Current subtarget doesn't support smax v2i64!");
+
+  // For vector abs, it can be lowered to:
+  // abs x
+  // ==>
+  // y = -x
+  // smax(x, y)
+
+  SDLoc dl(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
+  SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X);
+
+  // SMAX patch https://reviews.llvm.org/D47332
+  // hasn't landed yet, so use the intrinsic form here for now.
+  // TODO: Use ISD::SMAX directly once the SMAX patch lands.
+  Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw;
+  if (VT == MVT::v2i64)
+    BifID = Intrinsic::ppc_altivec_vmaxsd;
+  else if (VT == MVT::v8i16)
+    BifID = Intrinsic::ppc_altivec_vmaxsh;
+  else if (VT == MVT::v16i8)
+    BifID = Intrinsic::ppc_altivec_vmaxsb;
+
+  return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
+}
+
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::MUL: return LowerMUL(Op, DAG);
+ case ISD::ABS: return LowerABS(Op, DAG);
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
}
}
}
+
+  // Combine vmaxsw/h/b(a, a's negation) to abs(a)
+  // to expose the vabsduw/h/b opportunity to downstream combines.
+ if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
+ (IID == Intrinsic::ppc_altivec_vmaxsw ||
+ IID == Intrinsic::ppc_altivec_vmaxsh ||
+ IID == Intrinsic::ppc_altivec_vmaxsb)) {
+ SDValue V1 = N->getOperand(1);
+ SDValue V2 = N->getOperand(2);
+ if ((V1.getSimpleValueType() == MVT::v4i32 ||
+ V1.getSimpleValueType() == MVT::v8i16 ||
+ V1.getSimpleValueType() == MVT::v16i8) &&
+ V1.getSimpleValueType() == V2.getSimpleValueType()) {
+ // (0-a, a)
+ if (V1.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
+ V1.getOperand(1) == V2) {
+ return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
+ }
+ // (a, 0-a)
+ if (V2.getOpcode() == ISD::SUB &&
+ ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
+ V2.getOperand(1) == V1) {
+ return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
+ }
+ // (x-y, y-x)
+ if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
+ V1.getOperand(0) == V2.getOperand(1) &&
+ V1.getOperand(1) == V2.getOperand(0)) {
+ return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
+ }
+ }
+ }
}
break;
}
case ISD::BUILD_VECTOR:
return DAGCombineBuildVector(N, DCI);
+ case ISD::ABS:
+ return combineABS(N, DCI);
}
return SDValue();
// For non-constant masks, we can always use the record-form and.
return true;
}
+
+// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0)
+// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0)
+// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0)
+// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0)
+// Transform (abs (sub a, b)) to (vabsd a b 1) if a & b of type v4i32
+//
+// The third VABSD operand indicates whether the inputs' sign bits must be
+// flipped (1) before the unsigned vabsdu* instruction is used; see the
+// PPCISD::VABSD description for why.
+SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
+  assert(Subtarget.hasP9Altivec() &&
+         "Only combine this when P9 altivec supported!");
+  // VABSDU* instructions exist only for byte/halfword/word elements.
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  if (N->getOperand(0).getOpcode() == ISD::SUB) {
+    // Even for signed integers, the SUB operands are known to be
+    // non-negative (as signed integers) when both are zero-extended, so the
+    // unsigned absolute difference is correct without any sign-bit flip.
+    unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode();
+    unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode();
+    if ((SubOpcd0 == ISD::ZERO_EXTEND ||
+         SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) &&
+        (SubOpcd1 == ISD::ZERO_EXTEND ||
+         SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) {
+      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
+                         N->getOperand(0)->getOperand(0),
+                         N->getOperand(0)->getOperand(1),
+                         DAG.getTargetConstant(0, dl, MVT::i32));
+    }
+
+    // For type v4i32, it can be optimized with xvnegsp + vabsduw (flag 1).
+    // Require hasOneUse so we don't keep both the SUB and the VABSD live.
+    if (N->getOperand(0).getValueType() == MVT::v4i32 &&
+        N->getOperand(0).hasOneUse()) {
+      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
+                         N->getOperand(0)->getOperand(0),
+                         N->getOperand(0)->getOperand(1),
+                         DAG.getTargetConstant(1, dl, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
/// An SDNode for swaps that are not associated with any loads/stores
/// and thereby have no chain.
SWAP_NO_CHAIN,
+
+ /// An SDNode for Power9 vector absolute value difference.
+ /// operand #0 vector
+ /// operand #1 vector
+ /// operand #2 constant i32 0 or 1, to indicate whether needs to patch
+ /// the most significant bit for signed i32
+ ///
+  /// Power9 VABSD* instructions are designed to support unsigned integer
+  /// vectors (byte/halfword/word); to use them for signed integer vectors,
+  /// we have to flip their sign bits first. Flipping the sign bits of
+  /// byte/halfword vectors would be inefficient, but for word vectors we
+  /// can leverage XVNEGSP to do it efficiently, e.g.:
+  /// abs(sub(a,b)) => VABSDUW(a+0x80000000, b+0x80000000)
+  ///               => VABSDUW((XVNEGSP a), (XVNEGSP b))
+ VABSD,
/// QVFPERM = This corresponds to the QPX qvfperm instruction.
QVFPERM,
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
/// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
/// SETCC with integer subtraction when (1) there is a legal way of doing it
def SDTVecConv : SDTypeProfile<1, 2, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
]>;
+def SDTVabsd : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>
+]>;
+
def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>;
def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
+def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
string asmstr, InstrItinClass itin, Intrinsic Int,
}
}
+// These P9Altivec-related patterns are placed here because they may be
+// selected to the VSX instruction xvnegsp; this avoids a possible undef.
+let Predicates = [HasP9Altivec] in {
+
+ def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 0))),
+ (v4i32 (VABSDUW $A, $B))>;
+
+ def : Pat<(v8i16 (PPCvabsd v8i16:$A, v8i16:$B, (i32 0))),
+ (v8i16 (VABSDUH $A, $B))>;
+
+ def : Pat<(v16i8 (PPCvabsd v16i8:$A, v16i8:$B, (i32 0))),
+ (v16i8 (VABSDUB $A, $B))>;
+
+  // As the PPCISD::VABSD description says, the last operand indicates
+  // whether to flip the sign bits.
+ def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))),
+ (v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>;
+}
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 -implicit-check-not vabsdu
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR7 -implicit-check-not vmaxsd
define <4 x i32> @simple_absv_32(<4 x i32> %a) local_unnamed_addr {
entry:
%0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %a, <4 x i32> %sub.i)
ret <4 x i32> %0
; CHECK-LABEL: simple_absv_32
-; CHECK-DAG: vxor v{{[0-9]+}}, v[[REG:[0-9]+]], v[[REG]]
-; CHECK-DAG: xvnegsp v2, v2
-; CHECK-DAG: xvnegsp v3, v{{[0-9]+}}
-; CHECK-NEXT: vabsduw v2, v2, v{{[0-9]+}}
+; CHECK-NOT: vxor
+; CHECK-NOT: vabsduw
+; CHECK: vnegw v[[REG:[0-9]+]], v2
+; CHECK-NEXT: vmaxsw v2, v2, v[[REG]]
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: simple_absv_32
; CHECK-PWR8: xxlxor
; CHECK-PWR8: vsubuwm
; CHECK-PWR8: vmaxsw
; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_32
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsubuwm
+; CHECK-PWR7: vmaxsw
+; CHECK-PWR7: blr
}
define <4 x i32> @simple_absv_32_swap(<4 x i32> %a) local_unnamed_addr {
%0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub.i, <4 x i32> %a)
ret <4 x i32> %0
; CHECK-LABEL: simple_absv_32_swap
-; CHECK-DAG: vxor v{{[0-9]+}}, v[[REG:[0-9]+]], v[[REG]]
-; CHECK-DAG: xvnegsp v2, v2
-; CHECK-DAG: xvnegsp v3, v{{[0-9]+}}
-; CHECK-NEXT: vabsduw v2, v2, v{{[0-9]+}}
+; CHECK-NOT: vxor
+; CHECK-NOT: vabsduw
+; CHECK: vnegw v[[REG:[0-9]+]], v2
+; CHECK-NEXT: vmaxsw v2, v2, v[[REG]]
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: simple_absv_32_swap
; CHECK-PWR8: xxlxor
%0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %a, <8 x i16> %sub.i)
ret <8 x i16> %0
; CHECK-LABEL: simple_absv_16
-; CHECK: mtvsrws v{{[0-9]+}}, r{{[0-9]+}}
-; CHECK-NEXT: vadduhm v2, v2, v[[IMM:[0-9]+]]
-; CHECK-NEXT: vabsduh v2, v2, v[[IMM]]
+; CHECK-NOT: mtvsrws
+; CHECK-NOT: vabsduh
+; CHECK: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-NEXT: vsubuhm v[[REG:[0-9]+]], v[[ZERO]], v2
+; CHECK-NEXT: vmaxsh v2, v2, v[[REG]]
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: simple_absv_16
; CHECK-PWR8: xxlxor
; CHECK-PWR8: vsubuhm
; CHECK-PWR8: vmaxsh
; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_16
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsubuhm
+; CHECK-PWR7: vmaxsh
+; CHECK-PWR7: blr
}
define <16 x i8> @simple_absv_8(<16 x i8> %a) local_unnamed_addr {
%0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %a, <16 x i8> %sub.i)
ret <16 x i8> %0
; CHECK-LABEL: simple_absv_8
-; CHECK: xxspltib v{{[0-9]+}}, 128
-; CHECK-NEXT: vaddubm v2, v2, v[[IMM:[0-9]+]]
-; CHECK-NEXT: vabsdub v2, v2, v[[IMM]]
+; CHECK-NOT: xxspltib
+; CHECK-NOT: vabsdub
+; CHECK: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-NEXT: vsububm v[[REG:[0-9]+]], v[[ZERO]], v2
+; CHECK-NEXT: vmaxsb v2, v2, v[[REG]]
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: simple_absv_8
; CHECK-PWR8: xxlxor
; CHECK-PWR8: vsububm
; CHECK-PWR8: vmaxsb
; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_8
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsububm
+; CHECK-PWR7: vmaxsb
+; CHECK-PWR7: blr
+}
+
+; v2i64 vmax isn't available on pwr7
+define <2 x i64> @sub_absv_64(<2 x i64> %a, <2 x i64> %b) local_unnamed_addr {
+entry:
+ %0 = sub nsw <2 x i64> %a, %b
+ %1 = icmp sgt <2 x i64> %0, <i64 -1, i64 -1>
+ %2 = sub <2 x i64> zeroinitializer, %0
+ %3 = select <2 x i1> %1, <2 x i64> %0, <2 x i64> %2
+ ret <2 x i64> %3
+; CHECK-LABEL: sub_absv_64
+; CHECK: vsubudm
+; CHECK: vnegd
+; CHECK: vmaxsd
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_64
+; CHECK-PWR8-DAG: vsubudm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8: vmaxsd
+; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: sub_absv_64
+; CHECK-PWR7-NOT: vmaxsd
+; CHECK-PWR7: blr
}
; The select pattern can only be detected for v4i32.
%3 = select <4 x i1> %1, <4 x i32> %0, <4 x i32> %2
ret <4 x i32> %3
; CHECK-LABEL: sub_absv_32
-; CHECK-DAG: xvnegsp v3, v3
-; CHECK-DAG: xvnegsp v2, v2
-; CHECK-NEXT: vabsduw v2, v2, v3
+; CHECK-NOT: vsubuwm
+; CHECK-NOT: vnegw
+; CHECK-NOT: vmaxsw
+; CHECK-DAG: xvnegsp v2, v2
+; CHECK-DAG: xvnegsp v3, v3
+; CHECK-NEXT: vabsduw v2, v{{[23]}}, v{{[23]}}
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: sub_absv_32
-; CHECK-PWR8: vsubuwm
-; CHECK-PWR8: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8: vmaxsw
; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: sub_absv_32
+; CHECK-PWR7-DAG: vsubuwm
+; CHECK-PWR7-DAG: xxlxor
+; CHECK-PWR7: vmaxsw
+; CHECK-PWR7: blr
+}
+
+define <8 x i16> @sub_absv_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
+entry:
+ %0 = sub nsw <8 x i16> %a, %b
+ %1 = icmp sgt <8 x i16> %0, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %2 = sub <8 x i16> zeroinitializer, %0
+ %3 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> %2
+ ret <8 x i16> %3
+; CHECK-LABEL: sub_absv_16
+; CHECK-NOT: vabsduh
+; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_16
+; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-PWR8-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK-PWR8: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-PWR8-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
+; CHECK-PWR8-NEXT: blr
+; CHECK-PWR7-LABEL: sub_absv_16
+; CHECK-PWR7-DAG: vsubuhm
+; CHECK-PWR7-DAG: xxlxor
+; CHECK-PWR7: vmaxsh
+; CHECK-PWR7-NEXT: blr
+}
+
+define <16 x i8> @sub_absv_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
+entry:
+ %0 = sub nsw <16 x i8> %a, %b
+ %1 = icmp sgt <16 x i8> %0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %2 = sub <16 x i8> zeroinitializer, %0
+ %3 = select <16 x i1> %1, <16 x i8> %0, <16 x i8> %2
+ ret <16 x i8> %3
+; CHECK-LABEL: sub_absv_8
+; CHECK-NOT: vabsdub
+; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG: vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_8
+; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-PWR8-DAG: vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK-PWR8: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-PWR8-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
+; CHECK-PWR8-NEXT: blr
+; CHECK-PWR7-LABEL: sub_absv_8
+; CHECK-PWR7-DAG: xxlxor
+; CHECK-PWR7-DAG: vsububm
+; CHECK-PWR7: vmaxsb
+; CHECK-PWR7-NEXT: blr
}
; FIXME: This does not produce the ISD::ABS that we are looking for.
; We do manage to find the word version of ABS but not the halfword.
; Threfore, we end up doing more work than is required with a pair of abs for word
; instead of just one for the halfword.
-define <8 x i16> @sub_absv_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
+define <8 x i16> @sub_absv_16_ext(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
entry:
%0 = sext <8 x i16> %a to <8 x i32>
%1 = sext <8 x i16> %b to <8 x i32>
%5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
%6 = trunc <8 x i32> %5 to <8 x i16>
ret <8 x i16> %6
-; CHECK-LABEL: sub_absv_16
+; CHECK-LABEL: sub_absv_16_ext
; CHECK-NOT: vabsduh
; CHECK: vabsduw
+; CHECK-NOT: vnegw
; CHECK-NOT: vabsduh
; CHECK: vabsduw
+; CHECK-NOT: vnegw
; CHECK-NOT: vabsduh
; CHECK: blr
; CHECK-PWR8-LABEL: sub_absv_16
-; CHECK-PWR8: vsubuwm
-; CHECK-PWR8: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
; CHECK-PWR8: blr
}
; FIXME: This does not produce ISD::ABS. This does not even vectorize correctly!
; This function should look like sub_absv_32 and sub_absv_16 except that the type is v16i8.
; Function Attrs: norecurse nounwind readnone
-define <16 x i8> @sub_absv_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
+define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
entry:
%vecext = extractelement <16 x i8> %a, i32 0
%conv = zext i8 %vecext to i32
%conv122 = trunc i32 %15 to i8
%vecins123 = insertelement <16 x i8> %vecins115, i8 %conv122, i32 15
ret <16 x i8> %vecins123
-; CHECK-LABEL: sub_absv_8
+; CHECK-LABEL: sub_absv_8_ext
; CHECK-NOT: vabsdub
; CHECK: subf
; CHECK-NOT: vabsdub
; CHECK: xor
; CHECK-NOT: vabsdub
; CHECK: blr
-; CHECK-PWR8-LABEL: sub_absv_8
+; CHECK-PWR8-LABEL: sub_absv_8_ext
; CHECK-PWR8: subf
; CHECK-PWR8: xor
; CHECK-PWR8: blr
%0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub, <4 x i32> %sub.i)
ret <4 x i32> %0
; CHECK-LABEL: sub_absv_vec_32
-; CHECK: vabsduw v2, v2, v3
+; CHECK-NOT: vsubuwm
+; CHECK-NOT: vnegw
+; CHECK-NOT: vmaxsw
+; CHECK-DAG: xvnegsp v2, v2
+; CHECK-DAG: xvnegsp v3, v3
+; CHECK-NEXT: vabsduw v2, v{{[23]}}, v{{[23]}}
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: sub_absv_vec_32
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
; CHECK-PWR8: vmaxsw
; CHECK-PWR8: blr
}
%0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %sub, <8 x i16> %sub.i)
ret <8 x i16> %0
; CHECK-LABEL: sub_absv_vec_16
-; CHECK: vabsduh v2, v2, v3
+; CHECK-NOT: mtvsrws
+; CHECK-NOT: vabsduh
+; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: sub_absv_vec_16
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsubuhm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsubuhm
; CHECK-PWR8: vmaxsh
; CHECK-PWR8: blr
}
%0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %sub, <16 x i8> %sub.i)
ret <16 x i8> %0
; CHECK-LABEL: sub_absv_vec_8
-; CHECK: vabsdub v2, v2, v3
+; CHECK-NOT: xxspltib
+; CHECK-NOT: vabsdub
+; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG: vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: sub_absv_vec_8
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsububm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsububm
; CHECK-PWR8: vmaxsb
; CHECK-PWR8: blr
}
+define <4 x i32> @zext_sub_absd32(<4 x i16>, <4 x i16>) local_unnamed_addr {
+ %3 = zext <4 x i16> %0 to <4 x i32>
+ %4 = zext <4 x i16> %1 to <4 x i32>
+ %5 = sub <4 x i32> %3, %4
+ %6 = sub <4 x i32> zeroinitializer, %5
+ %7 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %5, <4 x i32> %6)
+ ret <4 x i32> %7
+; CHECK-LABEL: zext_sub_absd32
+; CHECK-NOT: xvnegsp
+; CHECK: vabsduw
+; CHECK: blr
+; CHECK-PWR8-LABEL: zext_sub_absd32
+; CHECK-PWR8: vmaxsw
+; CHECK-PWR8: blr
+}
+
+define <8 x i16> @zext_sub_absd16(<8 x i8>, <8 x i8>) local_unnamed_addr {
+ %3 = zext <8 x i8> %0 to <8 x i16>
+ %4 = zext <8 x i8> %1 to <8 x i16>
+ %5 = sub <8 x i16> %3, %4
+ %6 = sub <8 x i16> zeroinitializer, %5
+ %7 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %5, <8 x i16> %6)
+ ret <8 x i16> %7
+; CHECK-LABEL: zext_sub_absd16
+; CHECK-NOT: vadduhm
+; CHECK: vabsduh
+; CHECK: blr
+; CHECK-PWR8-LABEL: zext_sub_absd16
+; CHECK-PWR8: vmaxsh
+; CHECK-PWR8: blr
+}
+
+define <16 x i8> @zext_sub_absd8(<16 x i4>, <16 x i4>) local_unnamed_addr {
+ %3 = zext <16 x i4> %0 to <16 x i8>
+ %4 = zext <16 x i4> %1 to <16 x i8>
+ %5 = sub <16 x i8> %3, %4
+ %6 = sub <16 x i8> zeroinitializer, %5
+ %7 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %5, <16 x i8> %6)
+ ret <16 x i8> %7
+; CHECK-LABEL: zext_sub_absd8
+; CHECK-NOT: vaddubm
+; CHECK: vabsdub
+; CHECK: blr
+; CHECK-PWR8-LABEL: zext_sub_absd8
+; CHECK-PWR8: vmaxsb
+; CHECK-PWR8: blr
+}
declare <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32>, <4 x i32>)
; CHECK: vperm v4, v0, v4, v3
; CHECK: vperm v2, v5, v0, v2
; CHECK: vperm v3, v0, v5, v3
-; CHECK: xvnegsp v5, v1
-; CHECK: xvnegsp v4, v4
-; CHECK: xvnegsp v2, v2
-; CHECK: xvnegsp v3, v3
; CHECK: vabsduw v3, v4, v3
-; CHECK: vabsduw v2, v5, v2
+; CHECK: vabsduw v2, v1, v2
; CHECK: vadduwm v2, v2, v3
; CHECK: xxswapd v3, v2
; CHECK: vadduwm v2, v2, v3
; P9BE: vperm v4, v5, v4, v3
; P9BE: vperm v2, v5, v0, v2
; P9BE: vperm v3, v5, v0, v3
-; P9BE: xvnegsp v5, v1
-; P9BE: xvnegsp v4, v4
-; P9BE: xvnegsp v2, v2
-; P9BE: xvnegsp v3, v3
; P9BE: vabsduw v3, v4, v3
-; P9BE: vabsduw v2, v5, v2
+; P9BE: vabsduw v2, v1, v2
; P9BE: vadduwm v2, v2, v3
; P9BE: xxswapd v3, v2
; P9BE: vadduwm v2, v2, v3