LSR: Check more intrinsic pointer operands

author Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 11 Dec 2017 21:38:43 +0000 (21:38 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Mon, 11 Dec 2017 21:38:43 +0000 (21:38 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 11 Dec 2017 21:38:43 +0000 (21:38 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Mon, 11 Dec 2017 21:38:43 +0000 (21:38 +0000)
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

index 153a4a8ddb7e3fea889aa3b556ec1cb398bf0d75..77c2d4b956c6662ee513d693fac96cbce0644010 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -288,6 +288,32 @@ unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
    return 8;
  }
  
+bool AMDGPUTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
+                                       MemIntrinsicInfo &Info) const {
+  switch (Inst->getIntrinsicID()) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec: {
+    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
+    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
+    if (!Ordering || !Volatile)
+      return false; // Invalid.
+
+    unsigned OrderingVal = Ordering->getZExtValue();
+    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
+      return false;
+
+    Info.PtrVal = Inst->getArgOperand(0);
+    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
+    Info.ReadMem = true;
+    Info.WriteMem = true;
+    Info.IsVolatile = !Volatile->isNullValue();
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
  int AMDGPUTTIImpl::getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
      TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

index ee0683d39b49913bfa064c5900e72c9f0f0ac4a7..8899d2c6da8a3ba933a6445b16d6427ed17f0c1e 100644 (file)
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -132,6 +132,8 @@ public:
  
    unsigned getMaxInterleaveFactor(unsigned VF);
  
+  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
+
    int getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp

index a161c839b8d8ac8af38e61ac10a36e5fb0ac2c6f..953854c8b7b740d210d741bf1b6aed98722f7942 100644 (file)
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -777,7 +777,8 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
  
  /// Returns true if the specified instruction is using the specified value as an
  /// address.
-static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
+static bool isAddressUse(const TargetTransformInfo &TTI,
+                         Instruction *Inst, Value *OperandVal) {
    bool isAddress = isa<LoadInst>(Inst);
    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
      if (SI->getPointerOperand() == OperandVal)
@@ -786,18 +787,24 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
      // Addressing modes can also be folded into prefetches and a variety
      // of intrinsics.
      switch (II->getIntrinsicID()) {
-      default: break;
-      case Intrinsic::memset:
-      case Intrinsic::prefetch:
-        if (II->getArgOperand(0) == OperandVal)
-          isAddress = true;
-        break;
-      case Intrinsic::memmove:
-      case Intrinsic::memcpy:
-        if (II->getArgOperand(0) == OperandVal ||
-            II->getArgOperand(1) == OperandVal)
+    case Intrinsic::memset:
+    case Intrinsic::prefetch:
+      if (II->getArgOperand(0) == OperandVal)
+        isAddress = true;
+      break;
+    case Intrinsic::memmove:
+    case Intrinsic::memcpy:
+      if (II->getArgOperand(0) == OperandVal ||
+          II->getArgOperand(1) == OperandVal)
+        isAddress = true;
+      break;
+    default: {
+      MemIntrinsicInfo IntrInfo;
+      if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
+        if (IntrInfo.PtrVal == OperandVal)
            isAddress = true;
-        break;
+      }
+    }
      }
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
      if (RMW->getPointerOperand() == OperandVal)
@@ -810,7 +817,8 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
  }
  
  /// Return the type of the memory being accessed.
-static MemAccessTy getAccessType(const Instruction *Inst) {
+static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
+                                 Instruction *Inst) {
    MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
    if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
      AccessTy.MemTy = SI->getOperand(0)->getType();
@@ -821,6 +829,21 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
      AccessTy.AddrSpace = RMW->getPointerAddressSpace();
    } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
      AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::prefetch:
+      AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+      break;
+    default: {
+      MemIntrinsicInfo IntrInfo;
+      if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
+        AccessTy.AddrSpace
+          = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
+      }
+
+      break;
+    }
+    }
    }
  
    // All pointers have the same requirements, so canonicalize them to an
@@ -1025,7 +1048,7 @@ private:
                             ScalarEvolution &SE, DominatorTree &DT,
                             SmallPtrSetImpl<const SCEV *> *LoserRegs);
  };
-  
+
  /// An operand value in an instruction which is to be replaced with some
  /// equivalent, possibly strength-reduced, replacement.
  struct LSRFixup {
@@ -1149,7 +1172,7 @@ public:
      if (f.Offset < MinOffset)
        MinOffset = f.Offset;
    }
-  
+
    bool HasFormulaWithSameRegs(const Formula &F) const;
    float getNotSelectedProbability(const SCEV *Reg) const;
    bool InsertFormula(const Formula &F, const Loop &L);
@@ -2362,7 +2385,7 @@ LSRInstance::OptimizeLoopTermCond() {
                  C->getValue().isMinSignedValue())
                goto decline_post_inc;
              // Check for possible scaled-address reuse.
-            MemAccessTy AccessTy = getAccessType(UI->getUser());
+            MemAccessTy AccessTy = getAccessType(TTI, UI->getUser());
              int64_t Scale = C->getSExtValue();
              if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
                                            /*BaseOffset=*/0,
@@ -3032,13 +3055,13 @@ void LSRInstance::FinalizeChain(IVChain &Chain) {
  static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
                               Value *Operand, const TargetTransformInfo &TTI) {
    const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
-  if (!IncConst || !isAddressUse(UserInst, Operand))
+  if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
      return false;
  
    if (IncConst->getAPInt().getMinSignedBits() > 64)
      return false;
  
-  MemAccessTy AccessTy = getAccessType(UserInst);
+  MemAccessTy AccessTy = getAccessType(TTI, UserInst);
    int64_t IncOffset = IncConst->getValue()->getSExtValue();
    if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
                          IncOffset, /*HaseBaseReg=*/false))
@@ -3165,14 +3188,14 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
  
      LSRUse::KindType Kind = LSRUse::Basic;
      MemAccessTy AccessTy;
-    if (isAddressUse(UserInst, U.getOperandValToReplace())) {
+    if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
        Kind = LSRUse::Address;
-      AccessTy = getAccessType(UserInst);
+      AccessTy = getAccessType(TTI, UserInst);
      }
  
      const SCEV *S = IU.getExpr(U);
      PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
-    
+
      // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
      // (N - i == 0), and this allows (N - i) to be the expression that we work
      // with rather than just N or i, so we can consider the register
@@ -4304,7 +4327,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
          LUThatHas->pushFixup(Fixup);
          DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
        }
-      
+
        // Delete formulae from the new use which are no longer legal.
        bool Any = false;
        for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll

index 36eade395b79993ecda726a05c1c0e00cc88c0fa..75ce7f54ae39c80bc76ae21bf1cc5d08e9f09a60 100644 (file)
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -407,6 +407,19 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace
    ret void
  }
  
+; GCN-LABEL: {{^}}nocse_lds_atomic_inc_ret_i32:
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
+define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(3)* %ptr) #0 {
+  %result0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+  %result1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
+
+  store i32 %result0, i32 addrspace(1)* %out0
+  store i32 %result1, i32 addrspace(1)* %out1
+  ret void
+}
+
  attributes #0 = { nounwind }
  attributes #1 = { nounwind readnone }
  attributes #2 = { nounwind argmemonly }
diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll

index 054c61d187958b6a4370bb0e5b6dba646a801637..5d53b1b89da7cdd2356f90bf25a2cc049b6d3fe5 100644 (file)
--- a/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
+++ b/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
@@ -84,4 +84,84 @@ bb:
    br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
  }
  
-attributes #0 = { nounwind }
-\ No newline at end of file
+; OPT-LABEL: @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef, i32 0, i32 0, i1 false)
+; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_local_atomicinc_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+  %tmp4 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+  %tmp7 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
+  %tmp8 = add nsw i32 %tmp7, %tmp4
+  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %scevgep4, i32 undef, i32 0, i32 0, i1 false)
+; OPT: %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %lsr.iv1, i32 undef, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_local_atomicdec_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+  %tmp4 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp3, i32 undef, i32 0, i32 0, i1 false)
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+  %tmp7 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %tmp6, i32 undef, i32 0, i32 0, i1 false)
+  %tmp8 = add nsw i32 %tmp7, %tmp4
+  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind argmemonly }
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 11 Dec 2017 21:38:43 +0000 (21:38 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Mon, 11 Dec 2017 21:38:43 +0000 (21:38 +0000)
lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp		patch \| blob \| history
lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h		patch \| blob \| history
lib/Transforms/Scalar/LoopStrengthReduce.cpp		patch \| blob \| history
test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll		patch \| blob \| history
test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll		patch \| blob \| history