From: Nikita Popov Date: Sun, 7 Apr 2019 17:22:16 +0000 (+0000) Subject: Reapply [ValueTracking] Support min/max selects in computeConstantRange() X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cdee3b7f234ddf811f88007fd6c082bba38a5bd3;p=llvm Reapply [ValueTracking] Support min/max selects in computeConstantRange() Add support for min/max flavor selects in computeConstantRange(), which allows us to fold comparisons of a min/max against a constant in InstSimplify. This fixes an infinite InstCombine loop, with the test case taken from D59378. Relative to the previous iteration, this contains some adjustments for AMDGPU med3 tests: The AMDGPU target runs InstSimplify prior to codegen, which ends up constant folding some existing med3 tests after this change. To preserve these tests a hidden -amdgpu-scalar-ir-passes option is added, which allows disabling scalar IR passes (that use InstSimplify) for testing purposes. Differential Revision: https://reviews.llvm.org/D59506 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@357870 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index d3cbd07d646..e18679cf7bb 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -5689,7 +5689,28 @@ static void setLimitsForSelectPattern(const SelectInst &SI, APInt &Lower, return; } - // TODO Handle min/max flavors. 
+ const APInt *C; + if (!match(LHS, m_APInt(C)) && !match(RHS, m_APInt(C))) + return; + + switch (R.Flavor) { + case SPF_UMIN: + Upper = *C + 1; + break; + case SPF_UMAX: + Lower = *C; + break; + case SPF_SMIN: + Lower = APInt::getSignedMinValue(BitWidth); + Upper = *C + 1; + break; + case SPF_SMAX: + Lower = *C; + Upper = APInt::getSignedMaxValue(BitWidth) + 1; + break; + default: + break; + } } ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo) { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 3bfeea197bd..8dd467b8d96 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -169,6 +169,12 @@ EnableDCEInRA("amdgpu-dce-in-ra", cl::init(true), cl::Hidden, cl::desc("Enable machine DCE inside regalloc")); +static cl::opt<bool> EnableScalarIRPasses( + "amdgpu-scalar-ir-passes", + cl::desc("Enable scalar IR passes"), + cl::init(true), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -670,7 +676,8 @@ void AMDGPUPassConfig::addIRPasses() { if (EnableSROA) addPass(createSROAPass()); - addStraightLineScalarOptimizationPasses(); + if (EnableScalarIRPasses) + addStraightLineScalarOptimizationPasses(); if (EnableAMDGPUAliasAnalysis) { addPass(createAMDGPUAAWrapperPass()); @@ -696,7 +703,7 @@ void AMDGPUPassConfig::addIRPasses() { // %1 = shl %a, 2 // // but EarlyCSE can do neither of them. 
- if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses) addEarlyCSEOrGVNPass(); } diff --git a/test/CodeGen/AMDGPU/med3-no-simplify.ll b/test/CodeGen/AMDGPU/med3-no-simplify.ll new file mode 100644 index 00000000000..0d00c9a5e8e --- /dev/null +++ b/test/CodeGen/AMDGPU/med3-no-simplify.ll @@ -0,0 +1,48 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -amdgpu-scalar-ir-passes=false < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s + +; These tests are split out from umed3.ll and smed3.ll and use the +; -amdgpu-scalar-ir-passes=false flag, because InstSimplify would constant +; fold these functions otherwise. 
+ +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + +; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32: +; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} +; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} +define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp ugt i32 %a, 17 + %i0 = select i1 %icmp0, i32 %a, i32 17 + + %icmp1 = icmp ult i32 %i0, 12 + %i1 = select i1 %icmp1, i32 %i0, i32 12 + + store i32 %i1, i32 addrspace(1)* %outgep + ret void +} + +; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32: +; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} +; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} +define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid + %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %gep0 + + %icmp0 = icmp sgt i32 %a, 17 + %i0 = select i1 %icmp0, i32 %a, i32 17 + + %icmp1 = icmp slt i32 %i0, 12 + %i1 = select i1 %icmp1, i32 %i0, i32 12 + + store i32 %i1, i32 addrspace(1)* %outgep + ret void +} + diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll index 22c3b2d42f3..16aff2edba9 100644 --- a/test/CodeGen/AMDGPU/smed3.ll +++ b/test/CodeGen/AMDGPU/smed3.ll @@ -42,25 +42,6 @@ define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %o ret void } -; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32: -; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} -; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} -define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 
addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0 - - %icmp0 = icmp sgt i32 %a, 17 - %i0 = select i1 %icmp0, i32 %a, i32 17 - - %icmp1 = icmp slt i32 %i0, 12 - %i1 = select i1 %icmp1, i32 %i0, i32 12 - - store i32 %i1, i32 addrspace(1)* %outgep - ret void -} - ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32: ; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll index 2f6685921df..7c2a3414132 100644 --- a/test/CodeGen/AMDGPU/umed3.ll +++ b/test/CodeGen/AMDGPU/umed3.ll @@ -42,25 +42,6 @@ define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %o ret void } -; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32: -; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} -; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} -define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() - %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid - %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid - %a = load i32, i32 addrspace(1)* %gep0 - - %icmp0 = icmp ugt i32 %a, 17 - %i0 = select i1 %icmp0, i32 %a, i32 17 - - %icmp1 = icmp ult i32 %i0, 12 - %i1 = select i1 %icmp1, i32 %i0, i32 12 - - store i32 %i1, i32 addrspace(1)* %outgep - ret void -} - ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32: ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} diff --git a/test/Transforms/InstCombine/minmax-fold.ll b/test/Transforms/InstCombine/minmax-fold.ll index f68b62ad7d0..8588b36fa65 100644 --- a/test/Transforms/InstCombine/minmax-fold.ll +++ 
b/test/Transforms/InstCombine/minmax-fold.ll @@ -533,6 +533,37 @@ define i32 @clamp_check_for_no_infinite_loop2(i32 %i) { ret i32 %res } +; Check that there is no infinite loop because of reverse cmp transformation: +; (icmp slt smax(PositiveA, B) 2) -> (icmp eq B 1) +define i32 @clamp_check_for_no_infinite_loop3(i32 %i) { +; CHECK-LABEL: @clamp_check_for_no_infinite_loop3( +; CHECK-NEXT: [[I2:%.*]] = icmp sgt i32 [[I:%.*]], 1 +; CHECK-NEXT: [[I3:%.*]] = select i1 [[I2]], i32 [[I]], i32 1 +; CHECK-NEXT: br i1 true, label [[TRUELABEL:%.*]], label [[FALSELABEL:%.*]] +; CHECK: truelabel: +; CHECK-NEXT: [[I5:%.*]] = icmp slt i32 [[I3]], 2 +; CHECK-NEXT: [[I6:%.*]] = select i1 [[I5]], i32 [[I3]], i32 2 +; CHECK-NEXT: [[I7:%.*]] = shl nuw nsw i32 [[I6]], 2 +; CHECK-NEXT: ret i32 [[I7]] +; CHECK: falselabel: +; CHECK-NEXT: ret i32 0 +; + + %i2 = icmp sgt i32 %i, 1 + %i3 = select i1 %i2, i32 %i, i32 1 + %i4 = icmp sgt i32 %i3, 0 + br i1 %i4, label %truelabel, label %falselabel + +truelabel: ; %i<=1, %i3>0 + %i5 = icmp slt i32 %i3, 2 + %i6 = select i1 %i5, i32 %i3, i32 2 + %i7 = shl nuw nsw i32 %i6, 2 + ret i32 %i7 + +falselabel: + ret i32 0 +} + ; The next 3 min tests should canonicalize to the same form...and not infinite loop. 
define double @PR31751_umin1(i32 %x) { diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll index 2a9aa09cbbd..e30e39a84e9 100644 --- a/test/Transforms/InstCombine/sub.ll +++ b/test/Transforms/InstCombine/sub.ll @@ -1213,7 +1213,7 @@ define i32 @test66(i32 %x) { ; CHECK-LABEL: @test66( ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[X:%.*]], -101 ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i32 [[X]], i32 -101 -; CHECK-NEXT: [[RES:%.*]] = add i32 [[TMP2]], 1 +; CHECK-NEXT: [[RES:%.*]] = add nuw i32 [[TMP2]], 1 ; CHECK-NEXT: ret i32 [[RES]] ; %1 = xor i32 %x, -1 diff --git a/test/Transforms/InstSimplify/cmp_of_min_max.ll b/test/Transforms/InstSimplify/cmp_of_min_max.ll index 4726609c3f5..34c4a15aed1 100644 --- a/test/Transforms/InstSimplify/cmp_of_min_max.ll +++ b/test/Transforms/InstSimplify/cmp_of_min_max.ll @@ -3,10 +3,7 @@ define i1 @test_umax1(i32 %n) { ; CHECK-LABEL: @test_umax1( -; CHECK-NEXT: [[C1:%.*]] = icmp ugt i32 [[N:%.*]], 10 -; CHECK-NEXT: [[S:%.*]] = select i1 [[C1]], i32 [[N]], i32 10 -; CHECK-NEXT: [[C2:%.*]] = icmp ugt i32 [[S]], 9 -; CHECK-NEXT: ret i1 [[C2]] +; CHECK-NEXT: ret i1 true ; %c1 = icmp ugt i32 %n, 10 %s = select i1 %c1, i32 %n, i32 10 @@ -40,10 +37,7 @@ define i1 @test_umax3(i32 %n) { define i1 @test_umin1(i32 %n) { ; CHECK-LABEL: @test_umin1( -; CHECK-NEXT: [[C1:%.*]] = icmp ult i32 [[N:%.*]], 10 -; CHECK-NEXT: [[S:%.*]] = select i1 [[C1]], i32 [[N]], i32 10 -; CHECK-NEXT: [[C2:%.*]] = icmp ult i32 [[S]], 11 -; CHECK-NEXT: ret i1 [[C2]] +; CHECK-NEXT: ret i1 true ; %c1 = icmp ult i32 %n, 10 %s = select i1 %c1, i32 %n, i32 10 @@ -77,10 +71,7 @@ define i1 @test_umin3(i32 %n) { define i1 @test_smax1(i32 %n) { ; CHECK-LABEL: @test_smax1( -; CHECK-NEXT: [[C1:%.*]] = icmp sgt i32 [[N:%.*]], -10 -; CHECK-NEXT: [[S:%.*]] = select i1 [[C1]], i32 [[N]], i32 -10 -; CHECK-NEXT: [[C2:%.*]] = icmp sgt i32 [[S]], -11 -; CHECK-NEXT: ret i1 [[C2]] +; CHECK-NEXT: ret i1 true ; %c1 = icmp sgt i32 %n, -10 %s = select i1 
%c1, i32 %n, i32 -10 @@ -114,10 +105,7 @@ define i1 @test_smax3(i32 %n) { define i1 @test_smin1(i32 %n) { ; CHECK-LABEL: @test_smin1( -; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[N:%.*]], 10 -; CHECK-NEXT: [[S:%.*]] = select i1 [[C1]], i32 [[N]], i32 10 -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[S]], 11 -; CHECK-NEXT: ret i1 [[C2]] +; CHECK-NEXT: ret i1 true ; %c1 = icmp slt i32 %n, 10 %s = select i1 %c1, i32 %n, i32 10