// here since it doesn't require an extra register.
// 3) OR appears to be very slightly faster than ADD. (Though, the difference
// is small enough it might just be measurement noise.)
- // 4) For the moment, we are using top of stack. This creates false sharing
- // with actual stack access/call sequences, and it would be better to use a
- // location within the redzone. For the moment, this is still better than an
- // mfence though. TODO: Revise the offset used when we can assume a redzone.
+ // 4) When choosing offsets, there are several contributing factors:
+ // a) If there's no redzone, we default to TOS. (We could allocate a
+ // cache-line-aligned stack object to improve this case.)
+ // b) To minimize our chances of introducing a false dependence, we prefer
+ // to offset the stack usage slightly from TOS.
+ // c) To minimize concerns about cross-thread stack usage - in particular,
+ // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern (sketched
+ // below), which captures state in the TOS frame and accesses it from many
+ // threads - we want an offset that lands in a distinct cache line from
+ // the TOS frame.
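+ //
+ // A sketch of the pattern in (c), with MyThreadPool standing in for an
+ // arbitrary thread-pool API:
+ //   void hot(MyThreadPoolT &MyThreadPool) {
+ //     int StackVars = 0;                 // lives in hot()'s TOS frame
+ //     MyThreadPool.run([&StackVars]() {  // body runs on other threads,
+ //       ++StackVars;                     // writing through the captured
+ //     });                                // reference into our TOS frame
+ //     // A `lock or` fence aimed at the same cache line would now
+ //     // false-share with those remote accesses.
+ //   }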
//
// For a general discussion of the tradeoffs and benchmark results, see:
// https://shipilev.net/blog/2014/on-the-fence-with-dependencies/
+ auto &MF = DAG.getMachineFunction();
+ auto &TFL = *Subtarget.getFrameLowering();
+ const int SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0;
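+ // In effect: `lock orl $0, -64(%rsp)` when the 128-byte red zone is
+ // available, and `lock orl $0, (%rsp)` otherwise, as the test updates
+ // below reflect.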
+
if (Subtarget.is64Bit()) {
SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
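+ // The first five operands encode the x86 memory reference
+ // base + scale*index + disp (plus segment), i.e. [RSP + SPOffset] here;
+ // Zero is the $0 immediate and Chain orders the memory side effects.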
SDValue Ops[] = {
DAG.getRegister(X86::RSP, MVT::i64), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i64), // Index
- DAG.getTargetConstant(0, DL, MVT::i32), // Disp
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
Zero,
Chain};
DAG.getRegister(X86::ESP, MVT::i32), // Base
DAG.getTargetConstant(1, DL, MVT::i8), // Scale
DAG.getRegister(0, MVT::i32), // Index
- DAG.getTargetConstant(0, DL, MVT::i32), // Disp
+ DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp
DAG.getRegister(0, MVT::i16), // Segment.
Zero,
Chain
define void @or32_nouse_seq_cst(i32* %p) {
; X64-LABEL: or32_nouse_seq_cst:
; X64: # %bb.0:
-; X64-NEXT: lock orl $0, (%rsp)
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
;
; X86-LABEL: or32_nouse_seq_cst:
define void @or64_nouse_seq_cst(i64* %p) {
; X64-LABEL: or64_nouse_seq_cst:
; X64: # %bb.0:
-; X64-NEXT: lock orl $0, (%rsp)
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
;
; X86-LABEL: or64_nouse_seq_cst:
define void @or16_nouse_seq_cst(i16* %p) {
; X64-LABEL: or16_nouse_seq_cst:
; X64: # %bb.0:
-; X64-NEXT: lock orl $0, (%rsp)
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
;
; X86-LABEL: or16_nouse_seq_cst:
define void @or8_nouse_seq_cst(i8* %p) {
; X64-LABEL: or8_nouse_seq_cst:
; X64: # %bb.0:
-; X64-NEXT: lock orl $0, (%rsp)
+; X64-NEXT: lock orl $0, -{{[0-9]+}}(%rsp)
; X64-NEXT: retq
;
; X86-LABEL: or8_nouse_seq_cst:
; X64-NEXT: movq %rsp, %rax
; X64-NEXT: movq $-1, %rcx
; X64-NEXT: sarq $63, %rax
-; X64-NEXT: lock orl $0, (%rsp)
+; X64-NEXT: lock orl $0, -64(%rsp)
; X64-NEXT: shlq $47, %rax
; X64-NEXT: orq %rax, %rsp
; X64-NEXT: retq
;
; X64-LFENCE-LABEL: idempotent_atomic:
; X64-LFENCE: # %bb.0:
-; X64-LFENCE-NEXT: lock orl $0, (%rsp)
+; X64-LFENCE-NEXT: lock orl $0, -64(%rsp)
; X64-LFENCE-NEXT: retq
%tmp = atomicrmw or i32* %x, i32 0 seq_cst
ret void