SDValue BuildSDIVPow2(SDNode *N);
SDValue BuildUDIV(SDNode *N);
SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags);
- SDValue BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags);
- SDValue BuildRsqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
- SDNodeFlags *Flags);
- SDValue BuildRsqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
- SDNodeFlags *Flags);
+ SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags);
+ SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags);
+ // Shared worker for buildRsqrtEstimate / buildSqrtEstimate; Reciprocal
+ // selects rsqrt(Op) (true) or sqrt(Op) (false).
+ SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, bool Reciprocal);
+ SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal);
+ SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal);
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
- if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0), Flags)) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) {
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
}
} else if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
- if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0),
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
Flags)) {
RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
AddToWorklist(RV.getNode());
}
} else if (N1.getOpcode() == ISD::FP_ROUND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
- if (SDValue RV = BuildRsqrtEstimate(N1.getOperand(0).getOperand(0),
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
Flags)) {
RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
AddToWorklist(RV.getNode());
if (SqrtOp.getNode()) {
// We found a FSQRT, so try to make this fold:
// x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
- if (SDValue RV = BuildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
+ if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags);
AddToWorklist(RV.getNode());
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
// For now, create a Flags object for use with all unsafe math transforms.
SDNodeFlags Flags;
Flags.setUnsafeAlgebra(true);
-
- // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
- SDValue RV = BuildRsqrtEstimate(N->getOperand(0), &Flags);
- if (!RV)
- return SDValue();
-
- EVT VT = RV.getValueType();
- SDLoc DL(N);
- RV = DAG.getNode(ISD::FMUL, DL, VT, N->getOperand(0), RV, &Flags);
- AddToWorklist(RV.getNode());
-
- // Unfortunately, RV is now NaN if the input was exactly 0.
- // Select out this case and force the answer to 0.
- SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
- EVT CCVT = getSetCCResultType(VT);
- SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, N->getOperand(0), Zero, ISD::SETEQ);
- AddToWorklist(ZeroCmp.getNode());
- AddToWorklist(RV.getNode());
-
- return DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
- ZeroCmp, Zero, RV);
+ return buildSqrtEstimate(N->getOperand(0), &Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
/// =>
/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
/// As a result, we precompute A/2 prior to the iteration loop.
-SDValue DAGCombiner::BuildRsqrtNROneConst(SDValue Arg, SDValue Est,
- unsigned Iterations,
- SDNodeFlags *Flags) {
+SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
+ unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal) {
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
AddToWorklist(Est.getNode());
}
+
+ // If non-reciprocal square root is requested, multiply the result by Arg.
+ if (!Reciprocal) {
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
+ AddToWorklist(Est.getNode());
+ }
+
return Est;
}
/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
/// =>
/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
-SDValue DAGCombiner::BuildRsqrtNRTwoConst(SDValue Arg, SDValue Est,
- unsigned Iterations,
- SDNodeFlags *Flags) {
+SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
+ unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal) {
+ // Arg is the operand A and Est the starting estimate E that the loop below
+ // refines Iterations times. With Reciprocal set the refined reciprocal
+ // square root is returned; otherwise the last step is rewritten so that the
+ // result is the square root itself. Every node created is added to the
+ // combiner worklist.
EVT VT = Arg.getValueType();
SDLoc DL(Arg);
SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT);
- // Newton iterations: Est = -0.5 * Est * (-3.0 + Arg * Est * Est)
- for (unsigned i = 0; i < Iterations; ++i) {
- SDValue HalfEst = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
- AddToWorklist(HalfEst.getNode());
-
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
- AddToWorklist(Est.getNode());
-
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
- AddToWorklist(Est.getNode());
+ // This routine must enter the loop below to work correctly
+ // when (Reciprocal == false): the multiplication by Arg that turns the
+ // reciprocal into a square root only happens inside the final iteration.
+ assert(Iterations > 0 &&
+ "buildSqrtNRTwoConst requires at least one Newton iteration");
- Est = DAG.getNode(ISD::FADD, DL, VT, Est, MinusThree, Flags);
- AddToWorklist(Est.getNode());
+ // Newton iterations for reciprocal square root:
+ // E = (E * -0.5) * ((A * E) * E + -3.0)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
+ AddToWorklist(AE.getNode());
+
+ SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
+ AddToWorklist(AEE.getNode());
+
+ SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);
+ AddToWorklist(RHS.getNode());
+
+ // When calculating a square root at the last iteration build:
+ // S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
+ // (notice a common subexpression)
+ SDValue LHS;
+ if (Reciprocal || (i + 1) < Iterations) {
+ // RSQRT: LHS = (E * -0.5)
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
+ } else {
+ // SQRT: LHS = (A * E) * -0.5
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
+ }
+ AddToWorklist(LHS.getNode());
- Est = DAG.getNode(ISD::FMUL, DL, VT, Est, HalfEst, Flags);
+ Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
AddToWorklist(Est.getNode());
}
+
return Est;
}
-SDValue DAGCombiner::BuildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
+/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
+/// Op can be zero.
+///
+/// Returns an empty SDValue when Level is AfterLegalizeDAG or later, or when
+/// TLI.getRsqrtEstimate yields no estimate for Op.
+SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags,
+ bool Reciprocal) {
if (Level >= AfterLegalizeDAG)
return SDValue();
if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) {
AddToWorklist(Est.getNode());
if (Iterations) {
- Est = UseOneConstNR ?
- BuildRsqrtNROneConst(Op, Est, Iterations, Flags) :
- BuildRsqrtNRTwoConst(Op, Est, Iterations, Flags);
+ Est = UseOneConstNR
+ ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
+ : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
- }
+ } else if (!Reciprocal) {
+ // With zero refinement steps neither Newton helper runs, so Est is still
+ // the raw rsqrt(Op) estimate. Multiply by Op here so that a square-root
+ // request yields Op * rsqrt(Op) rather than the reciprocal.
+ Est = DAG.getNode(ISD::FMUL, SDLoc(Op), Op.getValueType(), Est, Op, Flags);
+ AddToWorklist(Est.getNode());
+ }
return Est;
}
return SDValue();
}
+/// Build an estimate of the reciprocal square root of Op by forwarding to
+/// buildSqrtEstimateImpl with its reciprocal flag set. The result is whatever
+/// that helper returns.
+SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+ return buildSqrtEstimateImpl(Op, Flags, true);
+}
+
+/// Build an estimate of sqrt(Op) on top of buildSqrtEstimateImpl, guarding the
+/// result against a zero input. Returns an empty SDValue when the helper
+/// produces nothing.
+SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+ SDValue Root = buildSqrtEstimateImpl(Op, Flags, /*Reciprocal=*/false);
+ if (!Root)
+ return SDValue();
+
+ // The estimate comes out as NaN when the input is exactly 0, so compare the
+ // input against zero and select 0 for that case.
+ SDLoc Loc(Op);
+ EVT ResultVT = Root.getValueType();
+ SDValue FPZero = DAG.getConstantFP(0.0, Loc, ResultVT);
+ SDValue IsZero = DAG.getSetCC(Loc, getSetCCResultType(ResultVT), Op, FPZero,
+ ISD::SETEQ);
+ AddToWorklist(IsZero.getNode());
+
+ // Vector results need VSELECT; scalars use plain SELECT.
+ unsigned SelectOpc = ResultVT.isVector() ? ISD::VSELECT : ISD::SELECT;
+ SDValue Guarded = DAG.getNode(SelectOpc, Loc, ResultVT, IsZero, FPZero, Root);
+ AddToWorklist(Guarded.getNode());
+ return Guarded;
+}
+
/// Return true if base is a frame index, which is known not to alias with
/// anything but itself. Provides base object and offset as results.
static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
--- /dev/null
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2,fma -recip=sqrt:2 -stop-after=expand-isel-pseudos 2>&1 | FileCheck %s
+
+declare float @llvm.sqrt.f32(float) #0
+
+define float @foo(float %f) #0 {
+; CHECK: {{name: *foo}}
+; CHECK: body:
+; CHECK: %0 = COPY %xmm0
+; CHECK: %1 = VRSQRTSSr killed %2, %0
+; CHECK: %3 = VMULSSrr %0, %1
+; CHECK: %4 = VMOVSSrm
+; CHECK: %5 = VFMADDSSr213r %1, killed %3, %4
+; CHECK: %6 = VMOVSSrm
+; CHECK: %7 = VMULSSrr %1, %6
+; CHECK: %8 = VMULSSrr killed %7, killed %5
+; CHECK: %9 = VMULSSrr %0, %8
+; CHECK: %10 = VFMADDSSr213r %8, %9, %4
+; CHECK: %11 = VMULSSrr %9, %6
+; CHECK: %12 = VMULSSrr killed %11, killed %10
+; CHECK: %13 = FsFLD0SS
+; CHECK: %14 = VCMPSSrr %0, killed %13, 0
+; CHECK: %15 = VFsANDNPSrr killed %14, killed %12
+; CHECK: %xmm0 = COPY %15
+; CHECK: RET 0, %xmm0
+ %call = tail call float @llvm.sqrt.f32(float %f) #1
+ ret float %call
+}
+
+define float @rfoo(float %f) #0 {
+; CHECK: {{name: *rfoo}}
+; CHECK: body: |
+; CHECK: %0 = COPY %xmm0
+; CHECK: %1 = VRSQRTSSr killed %2, %0
+; CHECK: %3 = VMULSSrr %0, %1
+; CHECK: %4 = VMOVSSrm
+; CHECK: %5 = VFMADDSSr213r %1, killed %3, %4
+; CHECK: %6 = VMOVSSrm
+; CHECK: %7 = VMULSSrr %1, %6
+; CHECK: %8 = VMULSSrr killed %7, killed %5
+; CHECK: %9 = VMULSSrr %0, %8
+; CHECK: %10 = VFMADDSSr213r %8, killed %9, %4
+; CHECK: %11 = VMULSSrr %8, %6
+; CHECK: %12 = VMULSSrr killed %11, killed %10
+; CHECK: %xmm0 = COPY %12
+; CHECK: RET 0, %xmm0
+ %sqrt = tail call float @llvm.sqrt.f32(float %f)
+ %div = fdiv fast float 1.0, %sqrt
+ ret float %div
+}
+
+attributes #0 = { "unsafe-fp-math"="true" }
+attributes #1 = { nounwind readnone }
; ESTIMATE-LABEL: ff:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm3
-; ESTIMATE-NEXT: vmulss %xmm3, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm0, %xmm2
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm2, %xmm1
; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm2, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm2, %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm2, %xmm2
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm2, %xmm1
; ESTIMATE-NEXT: vxorps %xmm2, %xmm2, %xmm2
; ESTIMATE-NEXT: vcmpeqss %xmm2, %xmm0, %xmm0
; ESTIMATE-NEXT: vandnps %xmm1, %xmm0, %xmm0
; ESTIMATE-LABEL: reciprocal_square_root:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm2
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulss %xmm1, %xmm1, %xmm2
; ESTIMATE-NEXT: vmulss %xmm2, %xmm0, %xmm0
+; ESTIMATE-NEXT: vaddss {{.*}}(%rip), %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulss {{.*}}(%rip), %xmm1, %xmm1
+; ESTIMATE-NEXT: vmulss %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: retq
%sqrt = tail call float @llvm.sqrt.f32(float %x)
%div = fdiv fast float 1.0, %sqrt
; ESTIMATE-LABEL: reciprocal_square_root_v4f32:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtps %xmm0, %xmm1
-; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
-; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; ESTIMATE-NEXT: vmulps %xmm1, %xmm1, %xmm2
+; ESTIMATE-NEXT: vmulps %xmm2, %xmm0, %xmm0
; ESTIMATE-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
; ESTIMATE-NEXT: vmulps {{.*}}(%rip), %xmm1, %xmm1
-; ESTIMATE-NEXT: vmulps %xmm1, %xmm0, %xmm0
+; ESTIMATE-NEXT: vmulps %xmm0, %xmm1, %xmm0
; ESTIMATE-NEXT: retq
%sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
%div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
; ESTIMATE-LABEL: reciprocal_square_root_v8f32:
; ESTIMATE: # BB#0:
; ESTIMATE-NEXT: vrsqrtps %ymm0, %ymm1
-; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
-; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; ESTIMATE-NEXT: vmulps %ymm1, %ymm1, %ymm2
+; ESTIMATE-NEXT: vmulps %ymm2, %ymm0, %ymm0
; ESTIMATE-NEXT: vaddps {{.*}}(%rip), %ymm0, %ymm0
; ESTIMATE-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
-; ESTIMATE-NEXT: vmulps %ymm1, %ymm0, %ymm0
+; ESTIMATE-NEXT: vmulps %ymm0, %ymm1, %ymm0
; ESTIMATE-NEXT: retq
%sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
%div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt