AMDGPU: Fix insertion point when reducing load intrinsics

author Matt Arsenault <Matthew.Arsenault@amd.com>

Fri, 10 Mar 2017 05:25:49 +0000 (05:25 +0000)

committer Matt Arsenault <Matthew.Arsenault@amd.com>

Fri, 10 Mar 2017 05:25:49 +0000 (05:25 +0000)
author Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 10 Mar 2017 05:25:49 +0000 (05:25 +0000)
committer Matt Arsenault <Matthew.Arsenault@amd.com>
Fri, 10 Mar 2017 05:25:49 +0000 (05:25 +0000)
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

index 78ee9d5de3fd1cc7110ce5d9a168baa336a5f05d..843ca7fedfc69a55ba49264b1e7473f93ede5297 100644 (file)
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1584,6 +1584,9 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
        for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
          Args.push_back(II->getArgOperand(I));
  
+      IRBuilderBase::InsertPointGuard Guard(*Builder);
+      Builder->SetInsertPoint(II);
+
        CallInst *NewCall = Builder->CreateCall(NewIntrin, Args);
        NewCall->takeName(II);
        NewCall->copyMetadata(*II);
diff --git a/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll b/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll

index 642f537d1103a1565f672e5d4556d3a27d662a41..888f51bf939dd10d86e36311fb3e96f268fcb9a8 100644 (file)
--- a/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll
+++ b/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll
@@ -266,6 +266,44 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i3
    ret <2 x float> %shuf
  }
  
+; The initial insertion point is at the extractelement
+; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <2 x double>
+; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0
+; CHECK-NEXT: ret double %tmp2
+define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %tmp1 = bitcast <4 x float> %tmp to <2 x double>
+  %tmp2 = extractelement <2 x double> %tmp1, i32 0
+  ret double %tmp2
+}
+
+; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
+; CHECK-NEXT: ret i32 %tmp2
+define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32>
+  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
+  ret i32 %tmp2
+}
+
+; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %tmp, i64 0
+; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <8 x i16>
+; CHECK-NEXT: %tmp2 = extractelement <8 x i16> %tmp1, i32 0
+; CHECK-NEXT: ret i16 %tmp2
+define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3
+  %tmp1 = bitcast <4 x float> %tmp to <8 x i16>
+  %tmp2 = extractelement <8 x i16> %tmp1, i32 0
+  ret i16 %tmp2
+}
+
  declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
  declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1
  declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1
author	Matt Arsenault <Matthew.Arsenault@amd.com>
	Fri, 10 Mar 2017 05:25:49 +0000 (05:25 +0000)
committer	Matt Arsenault <Matthew.Arsenault@amd.com>
	Fri, 10 Mar 2017 05:25:49 +0000 (05:25 +0000)
lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp		patch \| blob \| history
test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll		patch \| blob \| history