>;
}
+let Sched = WriteFRsqrt in {
+def SSE_RSQRTPS : OpndItins<
+ IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
+>;
+
+def SSE_RSQRTSS : OpndItins<
+ IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
+>;
+}
+
let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
- int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
+ int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
+defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
+defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
defm WriteJump : X86SchedWritePair;
// Floating point. This covers both scalar and vector operations.
-defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
-defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
-defm WriteFDiv : X86SchedWritePair; // Floating point division.
-defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
-defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal.
-defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
+defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
def IIC_SSE_SQRTSD_RR : InstrItinClass;
def IIC_SSE_SQRTSD_RM : InstrItinClass;
+def IIC_SSE_RSQRTPS_RR : InstrItinClass;
+def IIC_SSE_RSQRTPS_RM : InstrItinClass;
+def IIC_SSE_RSQRTSS_RR : InstrItinClass;
+def IIC_SSE_RSQRTSS_RM : InstrItinClass;
+
def IIC_SSE_RCPP_RR : InstrItinClass;
def IIC_SSE_RCPP_RM : InstrItinClass;
def IIC_SSE_RCPS_RR : InstrItinClass;
InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
+ InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
+ InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
+
InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
// FIXME: Double precision latencies
// FIXME: SS vs PS latencies
-// FIXME: RSQRT latencies
// FIXME: ymm latencies
////////////////////////////////////////////////////////////////////////////////
-defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
-defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
-defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
-defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
-defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
+defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
+defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
// Scalar and vector floating point.
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;