From: Krzysztof Parzyszek
Date: Fri, 24 Feb 2017 23:34:24 +0000 (+0000)
Subject: [Hexagon] Undo shift folding where it could simplify addressing mode
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=cdc2ace6920c405510e289b763c67482d396d901;p=llvm

[Hexagon] Undo shift folding where it could simplify addressing mode

For example, avoid (single shift):
  r0 = and(##536870908,lsr(r0,#3))
  r0 = memw(r1+r0<<#0)
in favor of (two shifts):
  r0 = lsr(r0,#5)
  r0 = memw(r1+r0<<#2)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296196 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 3a284622db3..06854ba5e09 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -1023,8 +1023,8 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
     }
   }
 
-  // Transform: (store ch addr (add x (add (shl y c) e)))
-  //        to: (store ch addr (add x (shl (add y d) c))),
+  // Transform: (store ch val (add x (add (shl y c) e)))
+  //        to: (store ch val (add x (shl (add y d) c))),
   // where e = (shl d c) for some integer d.
   // The purpose of this is to enable generation of loads/stores with
   // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
@@ -1033,7 +1033,7 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
     if (I->getOpcode() != ISD::STORE)
       continue;
-    // I matched: (store ch addr Off)
+    // I matched: (store ch val Off)
     SDValue Off = I->getOperand(2);
     // Off needs to match: (add x (add (shl y c) (shl d c))))
     if (Off.getOpcode() != ISD::ADD)
       continue;
@@ -1076,6 +1076,78 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
     ReplaceNode(T0.getNode(), NewShl.getNode());
   }
 
+  // Transform (load ch (add x (and (srl y c) Mask)))
+  //       to: (load ch (add x (shl (srl y d) d-c)))
+  // where
+  // Mask = 00..0 111..1 0..0
+  //          |     |     +-- d-c 0s, and d-c is 0, 1 or 2.
+  //          |     +-------- 1s
+  //          +-------------- at most c 0s
+  // Motivating example:
+  // DAG combiner optimizes (add x (shl (srl y 5) 2))
+  //                     to (add x (and (srl y 3) 1FFFFFFC))
+  // which results in a constant-extended and(##...,lsr). This transformation
+  // undoes this simplification for cases where the shl can be folded into
+  // an addressing mode.
+  for (SDNode *N : Nodes) {
+    unsigned Opc = N->getOpcode();
+    if (Opc != ISD::LOAD && Opc != ISD::STORE)
+      continue;
+    SDValue Addr = Opc == ISD::LOAD ? N->getOperand(1) : N->getOperand(2);
+    // Addr must match: (add x T0)
+    if (Addr.getOpcode() != ISD::ADD)
+      continue;
+    SDValue T0 = Addr.getOperand(1);
+    // T0 must match: (and T1 Mask)
+    if (T0.getOpcode() != ISD::AND)
+      continue;
+
+    // We have an AND.
+    //
+    // Check the first operand. It must be: (srl y c).
+    SDValue S = T0.getOperand(0);
+    if (S.getOpcode() != ISD::SRL)
+      continue;
+    ConstantSDNode *SN = dyn_cast<ConstantSDNode>(S.getOperand(1).getNode());
+    if (SN == nullptr)
+      continue;
+    if (SN->getAPIntValue().getBitWidth() != 32)
+      continue;
+    uint32_t CV = SN->getZExtValue();
+
+    // Check the second operand: the supposed mask.
+    ConstantSDNode *MN = dyn_cast<ConstantSDNode>(T0.getOperand(1).getNode());
+    if (MN == nullptr)
+      continue;
+    if (MN->getAPIntValue().getBitWidth() != 32)
+      continue;
+    uint32_t Mask = MN->getZExtValue();
+    // Examine the mask.
+    uint32_t TZ = countTrailingZeros(Mask);
+    uint32_t M1 = countTrailingOnes(Mask >> TZ);
+    uint32_t LZ = countLeadingZeros(Mask);
+    // Trailing zeros + middle ones + leading zeros must equal the width.
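+    // Illustrative note, not part of the original patch: for the motivating
+    // example above, Mask = 0x1FFFFFFC and c = 3 give TZ = 2, M1 = 27 and
+    // LZ = 3, so the checks below all pass and the rewrite uses d = TZ+c = 5
+    // with a shift of d-c = 2 folded into the addressing mode.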
+    if (TZ + M1 + LZ != 32)
+      continue;
+    // The number of trailing zeros will be encoded in the addressing mode.
+    if (TZ > 2)
+      continue;
+    // The number of leading zeros must be at most c.
+    if (LZ > CV)
+      continue;
+
+    // All looks good.
+    SDValue Y = S.getOperand(0);
+    EVT VT = Addr.getValueType();
+    SDLoc dl(S);
+    // TZ = D-C, so D = TZ+C.
+    SDValue D = DAG.getConstant(TZ+CV, dl, VT);
+    SDValue DC = DAG.getConstant(TZ, dl, VT);
+    SDValue NewSrl = DAG.getNode(ISD::SRL, dl, VT, Y, D);
+    SDValue NewShl = DAG.getNode(ISD::SHL, dl, VT, NewSrl, DC);
+    ReplaceNode(T0.getNode(), NewShl.getNode());
+  }
+
   if (EnableAddressRebalancing) {
     rebalanceAddressTrees();
 
diff --git a/test/CodeGen/Hexagon/undo-dag-shift.ll b/test/CodeGen/Hexagon/undo-dag-shift.ll
new file mode 100644
index 00000000000..c1ab5d73f5c
--- /dev/null
+++ b/test/CodeGen/Hexagon/undo-dag-shift.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; DAG combiner folds sequences of shifts, which can sometimes obscure
+; optimization opportunities. For example
+;
+;   unsigned int c(unsigned int b, unsigned int *a) {
+;     unsigned int bitidx = b >> 5;
+;     return a[bitidx];
+;   }
+;
+; produces
+;   (add x (shl (srl y 5) 2))
+; which is then folded into
+;   (add x (and (srl y 3) 1FFFFFFC))
+;
+; That results in a constant-extended and:
+;   r0 = and(##536870908,lsr(r0,#3))
+;   r0 = memw(r1+r0<<#0)
+; whereas
+;   r0 = lsr(r0,#5)
+;   r0 = memw(r1+r0<<#2)
+; is more desirable.
+
+target triple = "hexagon"
+
+; CHECK-LABEL: load_0
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#2)
+define i32 @load_0(i32 %b, i32* nocapture readonly %a) #0 {
+entry:
+  %shr = lshr i32 %b, 5
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %shr
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; This would require r0<<#3, which is not legal.
+; CHECK-LABEL: load_1
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#0)
+define i32 @load_1(i32 %b, [3 x i32]* nocapture readonly %a) #0 {
+entry:
+  %shr = lshr i32 %b, 5
+  %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %a, i32 %shr, i32 0
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; CHECK-LABEL: store_0
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#2)
+define void @store_0(i32 %b, i32* nocapture %a, i32 %v) #1 {
+entry:
+  %shr = lshr i32 %b, 5
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %shr
+  store i32 %v, i32* %arrayidx, align 4
+  ret void
+}
+
+attributes #0 = { norecurse nounwind readonly "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #1 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
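The shape check that drives the new transformation can be read in isolation:
the mask must be a single contiguous run of ones, with at most two trailing
zeros (the shl that folds into mem(x+y<<#c)) and no more leading zeros than
the original srl already produced. The sketch below is a minimal, standalone
C++ illustration of that check; it is not part of the patch. The names
undoShiftFolding and Result are invented for this example, and GCC/Clang
builtins stand in for llvm::countTrailingZeros, llvm::countTrailingOnes and
llvm::countLeadingZeros.

  #include <cassert>
  #include <cstdint>
  #include <optional>

  // Result of a successful match: the two new shift amounts.
  struct Result {
    uint32_t SrlAmt;  // d = TZ + c, the new right-shift amount
    uint32_t ShlAmt;  // d - c = TZ, folded into the addressing mode (<<#0..2)
  };

  // Given the shift amount c from (srl y c) and the AND mask produced by the
  // DAG combiner, decide whether (and (srl y c) Mask) can be rewritten as
  // (shl (srl y d) d-c). The mask must look like 00..0 11..1 0..0.
  std::optional<Result> undoShiftFolding(uint32_t C, uint32_t Mask) {
    if (Mask == 0)
      return std::nullopt;
    uint32_t TZ = __builtin_ctz(Mask);                       // trailing zeros
    uint32_t Shifted = Mask >> TZ;
    uint32_t M1 = Shifted == 0xFFFFFFFFu ? 32                // middle ones
                                         : __builtin_ctz(~Shifted);
    uint32_t LZ = __builtin_clz(Mask);                       // leading zeros
    if (TZ + M1 + LZ != 32)  // the ones must form one contiguous run
      return std::nullopt;
    if (TZ > 2)              // shifted addressing only allows <<#0, #1, #2
      return std::nullopt;
    if (LZ > C)              // high zeros must already come from the srl
      return std::nullopt;
    return Result{TZ + C, TZ};
  }

  int main() {
    // Motivating example: (and (srl y 3) 0x1FFFFFFC) becomes
    // (shl (srl y 5) 2), i.e. r0 = lsr(r0,#5) followed by memw(r1+r0<<#2).
    auto R = undoShiftFolding(3, 0x1FFFFFFC);
    assert(R && R->SrlAmt == 5 && R->ShlAmt == 2);

    // A mask needing <<#3, e.g. (and (srl y 2) 0x3FFFFFF8), is left alone.
    assert(!undoShiftFolding(2, 0x3FFFFFF8));
    return 0;
  }

Returning std::nullopt mirrors the continue paths in the patch: whenever any
of the checks fails, the DAG node is left untouched.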