From: Sanjay Patel
Date: Fri, 4 Jan 2019 17:38:12 +0000 (+0000)
Subject: [InstCombine] reduce raw IR narrowing rotate patterns to funnel shift
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=392e9299f53eae2f001ce72a906659817c50cb43;p=llvm

[InstCombine] reduce raw IR narrowing rotate patterns to funnel shift

Similar to rL350199 - there are no known analysis/codegen holes for
funnel shift intrinsics now, so we can canonicalize the 6+ regular
instructions to funnel shift to improve vectorization, inlining,
unrolling, etc.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350419 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 7a8c762d494..1201ac196ec 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -492,7 +492,7 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
 }
 
 /// Rotate left/right may occur in a wider type than necessary because of type
-/// promotion rules. Try to narrow all of the component instructions.
+/// promotion rules. Try to narrow the inputs and convert to funnel shift.
 Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
   assert((isa<VectorType>(Trunc.getSrcTy()) ||
           shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) &&
@@ -563,23 +563,15 @@ Instruction *InstCombiner::narrowRotate(TruncInst &Trunc) {
 
   // We have an unnecessarily wide rotate!
   // trunc (or (lshr ShVal, ShAmt), (shl ShVal, BitWidth - ShAmt))
-  // Narrow it down to eliminate the zext/trunc:
-  // or (lshr trunc(ShVal), ShAmt0'), (shl trunc(ShVal), ShAmt1')
+  // Narrow the inputs and convert to funnel shift intrinsic:
+  // llvm.fshl.i8(trunc(ShVal), trunc(ShVal), trunc(ShAmt))
   Value *NarrowShAmt = Builder.CreateTrunc(ShAmt, DestTy);
-  Value *NegShAmt = Builder.CreateNeg(NarrowShAmt);
-
-  // Mask both shift amounts to ensure there's no UB from oversized shifts.
-  Constant *MaskC = ConstantInt::get(DestTy, NarrowWidth - 1);
-  Value *MaskedShAmt = Builder.CreateAnd(NarrowShAmt, MaskC);
-  Value *MaskedNegShAmt = Builder.CreateAnd(NegShAmt, MaskC);
-
-  // Truncate the original value and use narrow ops.
   Value *X = Builder.CreateTrunc(ShVal, DestTy);
-  Value *NarrowShAmt0 = SubIsOnLHS ? MaskedNegShAmt : MaskedShAmt;
-  Value *NarrowShAmt1 = SubIsOnLHS ? MaskedShAmt : MaskedNegShAmt;
-  Value *NarrowSh0 = Builder.CreateBinOp(ShiftOpcode0, X, NarrowShAmt0);
-  Value *NarrowSh1 = Builder.CreateBinOp(ShiftOpcode1, X, NarrowShAmt1);
-  return BinaryOperator::CreateOr(NarrowSh0, NarrowSh1);
+  bool IsFshl = (!SubIsOnLHS && ShiftOpcode0 == BinaryOperator::Shl) ||
+                (SubIsOnLHS && ShiftOpcode1 == BinaryOperator::Shl);
+  Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
+  Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy);
+  return IntrinsicInst::Create(F, { X, X, NarrowShAmt });
 }
 
 /// Try to narrow the width of math or bitwise logic instructions by pulling a
diff --git a/test/Transforms/InstCombine/rotate.ll b/test/Transforms/InstCombine/rotate.ll
index 2da7fb48393..cc6735fe393 100644
--- a/test/Transforms/InstCombine/rotate.ll
+++ b/test/Transforms/InstCombine/rotate.ll
@@ -353,12 +353,7 @@ define <3 x i16> @rotr_safe_v3i16(<3 x i16> %x, <3 x i16> %y) {
 define i16 @rotate_left_16bit(i16 %v, i32 %shift) {
 ; CHECK-LABEL: @rotate_left_16bit(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHIFT:%.*]] to i16
-; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 15
-; CHECK-NEXT:    [[TMP3:%.*]] = sub i16 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = and i16 [[TMP3]], 15
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i16 [[V:%.*]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i16 [[V]], [[TMP2]]
-; CHECK-NEXT:    [[CONV2:%.*]] = or i16 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[CONV2:%.*]] = call i16 @llvm.fshl.i16(i16 [[V:%.*]], i16 [[V]], i16 [[TMP1]])
 ; CHECK-NEXT:    ret i16 [[CONV2]]
 ;
   %and = and i32 %shift, 15
@@ -376,12 +371,7 @@ define i16 @rotate_left_16bit(i16 %v, i32 %shift) {
 define <2 x i16> @rotate_left_commute_16bit_vec(<2 x i16> %v, <2 x i32> %shift) {
 ; CHECK-LABEL: @rotate_left_commute_16bit_vec(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i32> [[SHIFT:%.*]] to <2 x i16>
-; CHECK-NEXT:    [[TMP2:%.*]] = and <2 x i16> [[TMP1]], <i16 15, i16 15>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub <2 x i16> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = and <2 x i16> [[TMP3]], <i16 15, i16 15>
-; CHECK-NEXT:    [[TMP5:%.*]] = shl <2 x i16> [[V:%.*]], [[TMP2]]
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr <2 x i16> [[V]], [[TMP4]]
-; CHECK-NEXT:    [[CONV2:%.*]] = or <2 x i16> [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[CONV2:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[V:%.*]], <2 x i16> [[V]], <2 x i16> [[TMP1]])
 ; CHECK-NEXT:    ret <2 x i16> [[CONV2]]
 ;
   %and = and <2 x i32> %shift, <i32 15, i32 15>
@@ -399,11 +389,7 @@ define <2 x i16> @rotate_left_commute_16bit_vec(<2 x i16> %v, <2 x i32> %shift)
 define i8 @rotate_right_8bit(i8 %v, i3 %shift) {
 ; CHECK-LABEL: @rotate_right_8bit(
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i3 [[SHIFT:%.*]] to i8
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i3 0, [[SHIFT]]
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i3 [[TMP2]] to i8
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i8 [[V:%.*]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[V]], [[TMP1]]
-; CHECK-NEXT:    [[CONV2:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[CONV2:%.*]] = call i8 @llvm.fshr.i8(i8 [[V:%.*]], i8 [[V]], i8 [[TMP1]])
 ; CHECK-NEXT:    ret i8 [[CONV2]]
 ;
   %and = zext i3 %shift to i32
@@ -423,12 +409,8 @@ define i8 @rotate_right_commute_8bit(i32 %v, i32 %shift) {
 ; CHECK-LABEL: @rotate_right_commute_8bit(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHIFT:%.*]] to i8
 ; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[TMP1]], 3
-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw i8 0, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP3]], 7
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[V:%.*]] to i8
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i8 [[TMP5]], [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = shl i8 [[TMP5]], [[TMP4]]
-; CHECK-NEXT:    [[CONV2:%.*]] = or i8 [[TMP6]], [[TMP7]]
+; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[V:%.*]] to i8
+; CHECK-NEXT:    [[CONV2:%.*]] = call i8 @llvm.fshr.i8(i8 [[TMP3]], i8 [[TMP3]], i8 [[TMP2]])
 ; CHECK-NEXT:    ret i8 [[CONV2]]
 ;
   %and = and i32 %shift, 3
@@ -447,12 +429,7 @@ define i8 @rotate_right_commute_8bit(i32 %v, i32 %shift) {
 define i8 @rotate8_not_safe(i8 %v, i32 %shamt) {
 ; CHECK-LABEL: @rotate8_not_safe(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i8
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
-; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP2]], 7
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[V:%.*]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i8 [[V]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[RET:%.*]] = call i8 @llvm.fshl.i8(i8 [[V:%.*]], i8 [[V]], i8 [[TMP1]])
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
   %conv = zext i8 %v to i32
@@ -490,12 +467,7 @@ define i9 @rotate9_not_safe(i9 %v, i32 %shamt) {
 
 define i16 @rotateleft_16_neg_mask(i16 %v, i16 %shamt) {
 ; CHECK-LABEL: @rotateleft_16_neg_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i16 0, [[SHAMT:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[SHAMT]], 15
-; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr i16 [[V:%.*]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i16 [[V]], [[TMP2]]
-; CHECK-NEXT:    [[RET:%.*]] = or i16 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[RET:%.*]] = call i16 @llvm.fshl.i16(i16 [[V:%.*]], i16 [[V]], i16 [[SHAMT:%.*]])
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
   %neg = sub i16 0, %shamt
@@ -513,12 +485,7 @@ define i16 @rotateleft_16_neg_mask(i16 %v, i16 %shamt) {
 
 define i16 @rotateleft_16_neg_mask_commute(i16 %v, i16 %shamt) {
 ; CHECK-LABEL: @rotateleft_16_neg_mask_commute(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i16 0, [[SHAMT:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[SHAMT]], 15
-; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i16 [[V:%.*]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i16 [[V]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = or i16 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[RET:%.*]] = call i16 @llvm.fshl.i16(i16 [[V:%.*]], i16 [[V]], i16 [[SHAMT:%.*]])
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
   %neg = sub i16 0, %shamt
@@ -536,12 +503,7 @@ define i16 @rotateleft_16_neg_mask_commute(i16 %v, i16 %shamt) {
 
 define i8 @rotateright_8_neg_mask(i8 %v, i8 %shamt) {
 ; CHECK-LABEL: @rotateright_8_neg_mask(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 0, [[SHAMT:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[SHAMT]], 7
-; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
-; CHECK-NEXT:    [[TMP4:%.*]] = lshr i8 [[V:%.*]], [[TMP2]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i8 [[V]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[RET:%.*]] = call i8 @llvm.fshr.i8(i8 [[V:%.*]], i8 [[V]], i8 [[SHAMT:%.*]])
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
   %neg = sub i8 0, %shamt
@@ -559,12 +521,7 @@ define i8 @rotateright_8_neg_mask(i8 %v, i8 %shamt) {
 
 define i8 @rotateright_8_neg_mask_commute(i8 %v, i8 %shamt) {
 ; CHECK-LABEL: @rotateright_8_neg_mask_commute(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i8 0, [[SHAMT:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = and i8 [[SHAMT]], 7
-; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
-; CHECK-NEXT:    [[TMP4:%.*]] = shl i8 [[V:%.*]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[V]], [[TMP2]]
-; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[RET:%.*]] = call i8 @llvm.fshr.i8(i8 [[V:%.*]], i8 [[V]], i8 [[SHAMT:%.*]])
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
   %neg = sub i8 0, %shamt
@@ -586,12 +543,7 @@ define i8 @rotateright_8_neg_mask_commute(i8 %v, i8 %shamt) {
 define i16 @rotateright_16_neg_mask_wide_amount(i16 %v, i32 %shamt) {
 ; CHECK-LABEL: @rotateright_16_neg_mask_wide_amount(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i16
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i16 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
-; CHECK-NEXT:    [[TMP4:%.*]] = and i16 [[TMP2]], 15
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i16 [[V:%.*]], [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i16 [[V]], [[TMP4]]
-; CHECK-NEXT:    [[RET:%.*]] = or i16 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[RET:%.*]] = call i16 @llvm.fshr.i16(i16 [[V:%.*]], i16 [[V]], i16 [[TMP1]])
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
   %neg = sub i32 0, %shamt
@@ -608,12 +560,7 @@ define i16 @rotateright_16_neg_mask_wide_amount(i16 %v, i32 %shamt) {
 define i16 @rotateright_16_neg_mask_wide_amount_commute(i16 %v, i32 %shamt) {
 ; CHECK-LABEL: @rotateright_16_neg_mask_wide_amount_commute(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i16
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i16 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = and i16 [[TMP1]], 15
-; CHECK-NEXT:    [[TMP4:%.*]] = and i16 [[TMP2]], 15
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i16 [[V:%.*]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i16 [[V]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = or i16 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[RET:%.*]] = call i16 @llvm.fshr.i16(i16 [[V:%.*]], i16 [[V]], i16 [[TMP1]])
 ; CHECK-NEXT:    ret i16 [[RET]]
 ;
   %neg = sub i32 0, %shamt
@@ -630,12 +577,7 @@ define i16 @rotateright_16_neg_mask_wide_amount_commute(i16 %v, i32 %shamt) {
 define i8 @rotateleft_8_neg_mask_wide_amount(i8 %v, i32 %shamt) {
 ; CHECK-LABEL: @rotateleft_8_neg_mask_wide_amount(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i8
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
-; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP2]], 7
-; CHECK-NEXT:    [[TMP5:%.*]] = lshr i8 [[V:%.*]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = shl i8 [[V]], [[TMP3]]
-; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[RET:%.*]] = call i8 @llvm.fshl.i8(i8 [[V:%.*]], i8 [[V]], i8 [[TMP1]])
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
   %neg = sub i32 0, %shamt
@@ -652,12 +594,7 @@ define i8 @rotateleft_8_neg_mask_wide_amount(i8 %v, i32 %shamt) {
 define i8 @rotateleft_8_neg_mask_wide_amount_commute(i8 %v, i32 %shamt) {
 ; CHECK-LABEL: @rotateleft_8_neg_mask_wide_amount_commute(
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[SHAMT:%.*]] to i8
-; CHECK-NEXT:    [[TMP2:%.*]] = sub i8 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = and i8 [[TMP1]], 7
-; CHECK-NEXT:    [[TMP4:%.*]] = and i8 [[TMP2]], 7
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i8 [[V:%.*]], [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = lshr i8 [[V]], [[TMP4]]
-; CHECK-NEXT:    [[RET:%.*]] = or i8 [[TMP5]], [[TMP6]]
+; CHECK-NEXT:    [[RET:%.*]] = call i8 @llvm.fshl.i8(i8 [[V:%.*]], i8 [[V]], i8 [[TMP1]])
 ; CHECK-NEXT:    ret i8 [[RET]]
 ;
   %neg = sub i32 0, %shamt
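
For readers skimming the diff above, here is a minimal sketch of the transform in IR terms. It is illustrative only: the function below is hypothetical (it is not one of this patch's tests, though it mirrors the shape of rotate_left_16bit), with the shift amount masked to stay in range as in that test.

; An i8 rotate-left that was widened to i32 by type promotion.
define i8 @sketch_rotl8(i8 %v, i32 %shamt) {
  %conv = zext i8 %v to i32
  %amt = and i32 %shamt, 7       ; keep the shift amount in [0, 7]
  %shl = shl i32 %conv, %amt
  %sub = sub i32 8, %amt
  %shr = lshr i32 %conv, %sub
  %or = or i32 %shr, %shl
  %ret = trunc i32 %or to i8
  ret i8 %ret
}
; With this patch, instcombine narrows the inputs and emits a single
; funnel-shift call instead of the narrowed sub/and/shl/lshr/or sequence,
; roughly:
;   %t = trunc i32 %shamt to i8
;   %r = call i8 @llvm.fshl.i8(i8 %v, i8 %v, i8 %t)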