From e860308011e813da0e3850089a268df43ab2e396 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad@gmail.com>
Date: Mon, 22 Jul 2019 07:19:44 +0000
Subject: [PATCH] [AMDGPU] Save some work when an atomic op has no uses

Summary:
In the atomic optimizer, save doing a bunch of work and generating a
bunch of dead IR in the fairly common case where the result of an
atomic op (i.e. the value that was in memory before the atomic op was
performed) is not used. NFC.

Reviewers: arsenm, dstuttard, tpr

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, t-tye, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64981

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@366667 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 137 ++++++++++----------
 1 file changed, 70 insertions(+), 67 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 2982549357b..de974011a91 100644
--- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -491,77 +491,80 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
   // original instruction.
   B.SetInsertPoint(&I);
 
-  // Create a PHI node to get our new atomic result into the exit block.
-  PHINode *const PHI = B.CreatePHI(Ty, 2);
-  PHI->addIncoming(UndefValue::get(Ty), EntryBB);
-  PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
-
-  // We need to broadcast the value who was the lowest active lane (the first
-  // lane) to all other lanes in the wavefront. We use an intrinsic for this,
-  // but have to handle 64-bit broadcasts with two calls to this intrinsic.
-  Value *BroadcastI = nullptr;
-
-  if (TyBitWidth == 64) {
-    Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
-    Value *const ExtractHi =
-        B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
-    CallInst *const ReadFirstLaneLo =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
-    CallInst *const ReadFirstLaneHi =
-        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
-    Value *const PartialInsert = B.CreateInsertElement(
-        UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
-    Value *const Insert =
-        B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
-    BroadcastI = B.CreateBitCast(Insert, Ty);
-  } else if (TyBitWidth == 32) {
-
-    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
-  } else {
-    llvm_unreachable("Unhandled atomic bit width");
-  }
+  const bool NeedResult = !I.use_empty();
+  if (NeedResult) {
+    // Create a PHI node to get our new atomic result into the exit block.
+    PHINode *const PHI = B.CreatePHI(Ty, 2);
+    PHI->addIncoming(UndefValue::get(Ty), EntryBB);
+    PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
 
-  // Now that we have the result of our single atomic operation, we need to
-  // get our individual lane's slice into the result. We use the lane offset we
-  // previously calculated combined with the atomic result value we got from the
-  // first lane, to get our lane's index into the atomic result.
-  Value *LaneOffset = nullptr;
-  if (ValDivergent) {
-    LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
-  } else {
-    switch (Op) {
-    default:
-      llvm_unreachable("Unhandled atomic op");
-    case AtomicRMWInst::Add:
-    case AtomicRMWInst::Sub:
-      LaneOffset = B.CreateMul(V, Mbcnt);
-      break;
-    case AtomicRMWInst::And:
-    case AtomicRMWInst::Or:
-    case AtomicRMWInst::Max:
-    case AtomicRMWInst::Min:
-    case AtomicRMWInst::UMax:
-    case AtomicRMWInst::UMin:
-      LaneOffset = B.CreateSelect(Cond, Identity, V);
-      break;
-    case AtomicRMWInst::Xor:
-      LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
-      break;
+    // We need to broadcast the value who was the lowest active lane (the first
+    // lane) to all other lanes in the wavefront. We use an intrinsic for this,
+    // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+    Value *BroadcastI = nullptr;
+
+    if (TyBitWidth == 64) {
+      Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
+      Value *const ExtractHi =
+          B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
+      CallInst *const ReadFirstLaneLo =
+          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
+      CallInst *const ReadFirstLaneHi =
+          B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+      Value *const PartialInsert = B.CreateInsertElement(
+          UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
+      Value *const Insert =
+          B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
+      BroadcastI = B.CreateBitCast(Insert, Ty);
+    } else if (TyBitWidth == 32) {
+
+      BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+    } else {
+      llvm_unreachable("Unhandled atomic bit width");
     }
-  }
-  Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
 
-  if (IsPixelShader) {
-    // Need a final PHI to reconverge to above the helper lane branch mask.
-    B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+    // Now that we have the result of our single atomic operation, we need to
+    // get our individual lane's slice into the result. We use the lane offset
+    // we previously calculated combined with the atomic result value we got
+    // from the first lane, to get our lane's index into the atomic result.
+    Value *LaneOffset = nullptr;
+    if (ValDivergent) {
+      LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+    } else {
+      switch (Op) {
+      default:
+        llvm_unreachable("Unhandled atomic op");
+      case AtomicRMWInst::Add:
+      case AtomicRMWInst::Sub:
+        LaneOffset = B.CreateMul(V, Mbcnt);
+        break;
+      case AtomicRMWInst::And:
+      case AtomicRMWInst::Or:
+      case AtomicRMWInst::Max:
+      case AtomicRMWInst::Min:
+      case AtomicRMWInst::UMax:
+      case AtomicRMWInst::UMin:
+        LaneOffset = B.CreateSelect(Cond, Identity, V);
+        break;
+      case AtomicRMWInst::Xor:
+        LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+        break;
+      }
+    }
+    Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
 
-    PHINode *const PHI = B.CreatePHI(Ty, 2);
-    PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
-    PHI->addIncoming(Result, I.getParent());
-    I.replaceAllUsesWith(PHI);
-  } else {
-    // Replace the original atomic instruction with the new one.
-    I.replaceAllUsesWith(Result);
+    if (IsPixelShader) {
+      // Need a final PHI to reconverge to above the helper lane branch mask.
+      B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+
+      PHINode *const PHI = B.CreatePHI(Ty, 2);
+      PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
+      PHI->addIncoming(Result, I.getParent());
+      I.replaceAllUsesWith(PHI);
+    } else {
+      // Replace the original atomic instruction with the new one.
+      I.replaceAllUsesWith(Result);
+    }
   }
 
   // And delete the original.
-- 
2.40.0