switch (Inst->getOpcode()) {
case Instruction::Load: {
LoadInst *LI = cast<LoadInst>(Inst);
- return !LI->isVolatile();
+ // Currently we only handle the case where the pointer operand is a GEP, so check for that.
+ return isa<GetElementPtrInst>(LI->getPointerOperand()) && !LI->isVolatile();
}
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
return true;
case Instruction::Store: {
- // Must be the stored pointer operand, not a stored value.
+ // Must be the stored pointer operand, not a stored value. Additionally,
+ // since the IR should be in canonical form here, the user should be a GEP.
StoreInst *SI = cast<StoreInst>(Inst);
- return (SI->getPointerOperand() == User) && !SI->isVolatile();
+ return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && !SI->isVolatile();
}
default:
return false;
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
+ // FIXME: We also reject allocas of nested-array type, e.g. [2 x [2 x i32]] or
+ // equivalent. Potentially these could also be promoted, but we don't currently
+ // handle that case.
if (!AllocaTy ||
AllocaTy->getElementType()->isVectorTy() ||
+ AllocaTy->getElementType()->isArrayTy() ||
AllocaTy->getNumElements() > 4 ||
AllocaTy->getNumElements() < 2) {
DEBUG(dbgs() << " Cannot convert type to vector\n");
switch (Inst->getOpcode()) {
case Instruction::Load: {
Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
- Value *Ptr = Inst->getOperand(0);
+ Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
case Instruction::Store: {
Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
- Value *Ptr = Inst->getOperand(1);
+ StoreInst *SI = cast<StoreInst>(Inst);
+ Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(BitCast);
Value *NewVecValue = Builder.CreateInsertElement(VecValue,
- Inst->getOperand(0),
+ SI->getValueOperand(),
Index);
Builder.CreateStore(NewVecValue, BitCast);
Inst->eraseFromParent();
--- /dev/null
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-promote-alloca < %s | FileCheck --check-prefix=OPT %s\r
+\r
+; Make sure that array allocas loaded and stored as multi-element aggregates are handled correctly.\r
+; Strictly, the promote-alloca pass shouldn't have to deal with this case as it is non-canonical,\r
+; but it should handle it gracefully if it occurs.\r
+; The checks look for lines that previously caused issues in PromoteAlloca (non-canonical input).\r
+; Opt should now leave these unchanged.\r
+\r
+; OPT-LABEL: @promote_1d_aggr(\r
+; OPT: store [1 x float] %tmp3, [1 x float]* %f1\r
+\r
+%Block = type { [1 x float], i32 }\r
+%gl_PerVertex = type { <4 x float>, float, [1 x float], [1 x float] }\r
+\r
+@block = external addrspace(1) global %Block\r
+@pv = external addrspace(1) global %gl_PerVertex\r
+\r
+define amdgpu_vs void @promote_1d_aggr() #0 {\r
+; Stores a whole [1 x float] aggregate into the alloca, then reads an element\r
+; back through a dynamically indexed GEP - the non-canonical pattern under test.\r
+ %i = alloca i32\r
+ %f1 = alloca [1 x float]\r
+ %tmp = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 1\r
+ %tmp1 = load i32, i32 addrspace(1)* %tmp\r
+ store i32 %tmp1, i32* %i\r
+ %tmp2 = getelementptr %Block, %Block addrspace(1)* @block, i32 0, i32 0\r
+ %tmp3 = load [1 x float], [1 x float] addrspace(1)* %tmp2\r
+; Aggregate store directly to the alloca (no GEP on the pointer operand).\r
+ store [1 x float] %tmp3, [1 x float]* %f1\r
+ %tmp4 = load i32, i32* %i\r
+ %tmp5 = getelementptr [1 x float], [1 x float]* %f1, i32 0, i32 %tmp4\r
+ %tmp6 = load float, float* %tmp5\r
+ %tmp7 = alloca <4 x float>\r
+ %tmp8 = load <4 x float>, <4 x float>* %tmp7\r
+ %tmp9 = insertelement <4 x float> %tmp8, float %tmp6, i32 0\r
+ %tmp10 = insertelement <4 x float> %tmp9, float %tmp6, i32 1\r
+ %tmp11 = insertelement <4 x float> %tmp10, float %tmp6, i32 2\r
+ %tmp12 = insertelement <4 x float> %tmp11, float %tmp6, i32 3\r
+ %tmp13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0\r
+ store <4 x float> %tmp12, <4 x float> addrspace(1)* %tmp13\r
+ ret void\r
+}\r
+\r
+\r
+; OPT-LABEL: @promote_store_aggr(\r
+; OPT: %tmp6 = load [2 x float], [2 x float]* %f1\r
+\r
+%Block2 = type { i32, [2 x float] }\r
+@block2 = external addrspace(1) global %Block2\r
+\r
+define amdgpu_vs void @promote_store_aggr() #0 {\r
+; Writes elements through GEPs, then loads the whole [2 x float] aggregate\r
+; from the alloca and stores it to a global - the non-canonical pattern\r
+; under test (aggregate load of the alloca, no GEP on the pointer operand).\r
+ %i = alloca i32\r
+ %f1 = alloca [2 x float]\r
+ %tmp = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 0\r
+ %tmp1 = load i32, i32 addrspace(1)* %tmp\r
+ store i32 %tmp1, i32* %i\r
+ %tmp2 = load i32, i32* %i\r
+ %tmp3 = sitofp i32 %tmp2 to float\r
+ %tmp4 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 0\r
+ store float %tmp3, float* %tmp4\r
+ %tmp5 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 1\r
+ store float 2.000000e+00, float* %tmp5\r
+; Aggregate load directly from the alloca.\r
+ %tmp6 = load [2 x float], [2 x float]* %f1\r
+ %tmp7 = getelementptr %Block2, %Block2 addrspace(1)* @block2, i32 0, i32 1\r
+ store [2 x float] %tmp6, [2 x float] addrspace(1)* %tmp7\r
+ %tmp8 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0\r
+ store <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, <4 x float> addrspace(1)* %tmp8\r
+ ret void\r
+}\r
+\r
+; OPT-LABEL: @promote_load_from_store_aggr(\r
+; OPT: store [2 x float] %tmp3, [2 x float]* %f1\r
+\r
+%Block3 = type { [2 x float], i32 }\r
+@block3 = external addrspace(1) global %Block3\r
+\r
+define amdgpu_vs void @promote_load_from_store_aggr() #0 {\r
+; Stores a whole [2 x float] aggregate into the alloca, then reads an element\r
+; back through a dynamically indexed GEP - the non-canonical pattern under test.\r
+ %i = alloca i32\r
+ %f1 = alloca [2 x float]\r
+ %tmp = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 1\r
+ %tmp1 = load i32, i32 addrspace(1)* %tmp\r
+ store i32 %tmp1, i32* %i\r
+ %tmp2 = getelementptr %Block3, %Block3 addrspace(1)* @block3, i32 0, i32 0\r
+ %tmp3 = load [2 x float], [2 x float] addrspace(1)* %tmp2\r
+; Aggregate store directly to the alloca (no GEP on the pointer operand).\r
+ store [2 x float] %tmp3, [2 x float]* %f1\r
+ %tmp4 = load i32, i32* %i\r
+ %tmp5 = getelementptr [2 x float], [2 x float]* %f1, i32 0, i32 %tmp4\r
+ %tmp6 = load float, float* %tmp5\r
+ %tmp7 = alloca <4 x float>\r
+ %tmp8 = load <4 x float>, <4 x float>* %tmp7\r
+ %tmp9 = insertelement <4 x float> %tmp8, float %tmp6, i32 0\r
+ %tmp10 = insertelement <4 x float> %tmp9, float %tmp6, i32 1\r
+ %tmp11 = insertelement <4 x float> %tmp10, float %tmp6, i32 2\r
+ %tmp12 = insertelement <4 x float> %tmp11, float %tmp6, i32 3\r
+ %tmp13 = getelementptr %gl_PerVertex, %gl_PerVertex addrspace(1)* @pv, i32 0, i32 0\r
+ store <4 x float> %tmp12, <4 x float> addrspace(1)* %tmp13\r
+ ret void\r
+}\r
+\r
+; OPT-LABEL: @promote_double_aggr(\r
+; OPT: store [2 x double] %tmp5, [2 x double]* %s\r
+\r
+@tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }\r
+@frag_color = external addrspace(1) global <4 x float>\r
+\r
+define amdgpu_ps void @promote_double_aggr() #0 {\r
+; Builds a [2 x double] with insertvalue and stores it whole into the alloca;\r
+; elements are then read/written via constant-index GEPs. The aggregate store\r
+; (no GEP on the pointer operand) is the non-canonical pattern under test.\r
+ %s = alloca [2 x double]\r
+ %tmp = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 0\r
+ %tmp1 = load double, double addrspace(1)* %tmp\r
+ %tmp2 = getelementptr { [4 x double], <2 x double>, <3 x double>, <4 x double> }, { [4 x double], <2 x double>, <3 x double>, <4 x double> } addrspace(1)* @tmp_g, i32 0, i32 0, i32 1\r
+ %tmp3 = load double, double addrspace(1)* %tmp2\r
+ %tmp4 = insertvalue [2 x double] undef, double %tmp1, 0\r
+ %tmp5 = insertvalue [2 x double] %tmp4, double %tmp3, 1\r
+; Aggregate store directly to the alloca.\r
+ store [2 x double] %tmp5, [2 x double]* %s\r
+ %tmp6 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1\r
+ %tmp7 = load double, double* %tmp6\r
+ %tmp8 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1\r
+ %tmp9 = load double, double* %tmp8\r
+ %tmp10 = fadd double %tmp7, %tmp9\r
+ %tmp11 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 0\r
+ store double %tmp10, double* %tmp11\r
+ %tmp12 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 0\r
+ %tmp13 = load double, double* %tmp12\r
+ %tmp14 = getelementptr [2 x double], [2 x double]* %s, i32 0, i32 1\r
+ %tmp15 = load double, double* %tmp14\r
+ %tmp16 = fadd double %tmp13, %tmp15\r
+ %tmp17 = fptrunc double %tmp16 to float\r
+ %tmp18 = insertelement <4 x float> undef, float %tmp17, i32 0\r
+ %tmp19 = insertelement <4 x float> %tmp18, float %tmp17, i32 1\r
+ %tmp20 = insertelement <4 x float> %tmp19, float %tmp17, i32 2\r
+ %tmp21 = insertelement <4 x float> %tmp20, float %tmp17, i32 3\r
+ store <4 x float> %tmp21, <4 x float> addrspace(1)* @frag_color\r
+ ret void\r
+}\r