[X86][SchedModel] SSE reciprocal square root instruction latencies.

author Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>

Fri, 26 Sep 2014 12:56:44 +0000 (12:56 +0000)

committer Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>

Fri, 26 Sep 2014 12:56:44 +0000 (12:56 +0000)
author Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Fri, 26 Sep 2014 12:56:44 +0000 (12:56 +0000)
committer Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Fri, 26 Sep 2014 12:56:44 +0000 (12:56 +0000)
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td

index 923b3dab0f8bc71972c4811555f1fdd855cc83f4..3fd576ea25a983881b0a4fd720a5be8d7e2841e4 100644 (file)
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3344,6 +3344,16 @@ def SSE_SQRTSD : OpndItins<
  >;
  }
  
+let Sched = WriteFRsqrt in {
+def SSE_RSQRTPS : OpndItins<
+  IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
+>;
+
+def SSE_RSQRTSS : OpndItins<
+  IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
+>;
+}
+
  let Sched = WriteFRcp in {
  def SSE_RCPP : OpndItins<
    IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
@@ -3622,10 +3632,10 @@ defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
  
  // Reciprocal approximations. Note that these typically require refinement
  // in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
-             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
               sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
-                                int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
+                                int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
  defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
               sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
               sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td

index 7bb3569ad33556c9056ca115b2c6fc9ebaf277ae..73a32304302aaa405e6f2c8d886cbef6ca4cd445 100644 (file)
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -129,6 +129,7 @@ defm : HWWriteResPair<WriteFAdd,   HWPort1, 3>;
  defm : HWWriteResPair<WriteFMul,   HWPort0, 5>;
  defm : HWWriteResPair<WriteFDiv,   HWPort0, 12>; // 10-14 cycles.
  defm : HWWriteResPair<WriteFRcp,   HWPort0, 5>;
+defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
  defm : HWWriteResPair<WriteFSqrt,  HWPort0, 15>;
  defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
  defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td

index 83f053425aa139f6fc24f3290b5cb1d01b065644..eca65c2892b7e0cd61aa93806b781c3baf703df7 100644 (file)
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -117,6 +117,7 @@ defm : SBWriteResPair<WriteFAdd,   SBPort1, 3>;
  defm : SBWriteResPair<WriteFMul,   SBPort0, 5>;
  defm : SBWriteResPair<WriteFDiv,   SBPort0, 12>; // 10-14 cycles.
  defm : SBWriteResPair<WriteFRcp,   SBPort0, 5>;
+defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
  defm : SBWriteResPair<WriteFSqrt,  SBPort0, 15>;
  defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
  defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td

index a1c21b8711b07d807ca4a87ebe31eb16f73ebb02..a261356afe6a6aa9d2e6747e62f89f8b68bf8b03 100644 (file)
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -63,12 +63,13 @@ def WriteZero : SchedWrite;
  defm WriteJump : X86SchedWritePair;
  
  // Floating point. This covers both scalar and vector operations.
-defm WriteFAdd  : X86SchedWritePair; // Floating point add/sub/compare.
-defm WriteFMul  : X86SchedWritePair; // Floating point multiplication.
-defm WriteFDiv  : X86SchedWritePair; // Floating point division.
-defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
-defm WriteFRcp  : X86SchedWritePair; // Floating point reciprocal.
-defm WriteFMA   : X86SchedWritePair; // Fused Multiply Add.
+defm WriteFAdd   : X86SchedWritePair; // Floating point add/sub/compare.
+defm WriteFMul   : X86SchedWritePair; // Floating point multiplication.
+defm WriteFDiv   : X86SchedWritePair; // Floating point division.
+defm WriteFSqrt  : X86SchedWritePair; // Floating point square root.
+defm WriteFRcp   : X86SchedWritePair; // Floating point reciprocal estimate.
+defm WriteFRsqrt : X86SchedWritePair; // Fp reciprocal square root estimate.
+defm WriteFMA    : X86SchedWritePair; // Fused Multiply Add.
  defm WriteFShuffle  : X86SchedWritePair; // Floating point vector shuffles.
  defm WriteFBlend  : X86SchedWritePair; // Floating point vector blends.
  defm WriteFVarBlend  : X86SchedWritePair; // Fp vector variable blends.
@@ -314,6 +315,11 @@ def IIC_SSE_SQRTPD_RM : InstrItinClass;
  def IIC_SSE_SQRTSD_RR : InstrItinClass;
  def IIC_SSE_SQRTSD_RM : InstrItinClass;
  
+def IIC_SSE_RSQRTPS_RR : InstrItinClass;
+def IIC_SSE_RSQRTPS_RM : InstrItinClass;
+def IIC_SSE_RSQRTSS_RR : InstrItinClass;
+def IIC_SSE_RSQRTSS_RM : InstrItinClass;
+
  def IIC_SSE_RCPP_RR : InstrItinClass;
  def IIC_SSE_RCPP_RM : InstrItinClass;
  def IIC_SSE_RCPS_RR : InstrItinClass;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td

index c8820aa2d8dfd2d56c223a9d590ee7d8024aa329..4c559c9c1798da2ef7dc7e17cfd8946db627fe71 100644 (file)
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -224,6 +224,11 @@ def AtomItineraries : ProcessorItineraries<
    InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
    InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
  
+  InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
+  InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
+
    InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
    InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
    InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td

index ab2c520ea8949ef8185f04bbeee27930fc149927..ce1ece34e431a872d65d22a4938a91dda0ee0312 100644 (file)
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -163,15 +163,15 @@ defm : JWriteResIntPair<WriteJump,  JALU01, 1>;
  // FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
  // FIXME: Double precision latencies
  // FIXME: SS vs PS latencies
-// FIXME: RSQRT latencies
  // FIXME: ymm latencies
  ////////////////////////////////////////////////////////////////////////////////
  
-defm : JWriteResFpuPair<WriteFAdd,      JFPU0,  3>;
-defm : JWriteResFpuPair<WriteFMul,      JFPU1,  2>;
-defm : JWriteResFpuPair<WriteFRcp,      JFPU1,  2>;
-defm : JWriteResFpuPair<WriteFShuffle, JFPU01,  1>;
-defm : JWriteResFpuPair<WriteFBlend,   JFPU01,  1>;
+defm : JWriteResFpuPair<WriteFAdd,        JFPU0,  3>;
+defm : JWriteResFpuPair<WriteFMul,        JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFRcp,        JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFRsqrt,      JFPU1,  2>;
+defm : JWriteResFpuPair<WriteFShuffle,   JFPU01,  1>;
+defm : JWriteResFpuPair<WriteFBlend,     JFPU01,  1>;
  defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
  
  def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td

index 90d85878812462efa8eb3c2defa508f08542f7f7..f95d4fa041774410fca9f143c13b5956adf96cfe 100644 (file)
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -101,6 +101,7 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
  // Scalar and vector floating point.
  defm : SMWriteResPair<WriteFAdd,   FPC_RSV1, 3>;
  defm : SMWriteResPair<WriteFRcp,   FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
  defm : SMWriteResPair<WriteFSqrt,  FPC_RSV0, 15>;
  defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
  defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
author	Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
	Fri, 26 Sep 2014 12:56:44 +0000 (12:56 +0000)
committer	Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
	Fri, 26 Sep 2014 12:56:44 +0000 (12:56 +0000)
lib/Target/X86/X86InstrSSE.td		patch | blob | history
lib/Target/X86/X86SchedHaswell.td		patch | blob | history
lib/Target/X86/X86SchedSandyBridge.td		patch | blob | history
lib/Target/X86/X86Schedule.td		patch | blob | history
lib/Target/X86/X86ScheduleAtom.td		patch | blob | history
lib/Target/X86/X86ScheduleBtVer2.td		patch | blob | history
lib/Target/X86/X86ScheduleSLM.td		patch | blob | history