From ec070e1a4527abe7a7b561def097c7e2317fdc72 Mon Sep 17 00:00:00 2001 From: Bjorn Pettersson Date: Tue, 3 Sep 2019 19:35:07 +0000 Subject: [PATCH] [CodeGen] Use FSHR in DAGTypeLegalizer::ExpandIntRes_MULFIX Summary: Simplify the right shift of the intermediate result (given in four parts) by using funnel shift. There is some impact on lit tests, but that seems to be related to register allocation differences due to how FSHR is expanded on X86 (giving a slightly different operand order for the OR operations compared to the old code). Reviewers: leonardchan, RKSimon, spatel, lebedev.ri Reviewed By: RKSimon Subscribers: hiraditya, asb, rbar, johnrusso, simoncook, apazos, sabuasal, niosHD, jrtc27, MaskRay, zzheng, edward-jones, rogfer01, MartinMosbeck, brucehoult, the_o, PkmX, jocewei, s.egerton, pzheng, bevinh, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D67036 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@370813 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../SelectionDAG/LegalizeIntegerTypes.cpp | 68 +++------- test/CodeGen/RISCV/addcarry.ll | 4 +- test/CodeGen/X86/smul_fix.ll | 15 +- test/CodeGen/X86/smul_fix_sat.ll | 128 +++++++++--------- test/CodeGen/X86/umul_fix.ll | 20 +-- 5 files changed, 99 insertions(+), 136 deletions(-) diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 5db61436129..83d4249fcac 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -2857,11 +2857,6 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, "the size of the current value type"); EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout()); - SDValue ResultLL = Result[0]; - SDValue ResultLH = Result[1]; - SDValue ResultHL = Result[2]; - SDValue ResultHH = Result[3]; - // After getting the multiplication result in 4 parts, we need to perform a // shift right by the amount of the scale to get 
the result in that scale. // @@ -2876,50 +2871,22 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, // // |NVTSize-| // - // The resulting Lo and Hi will only need to be one of these 32-bit parts - // after shifting. - if (Scale < NVTSize) { - // If the scale is less than the size of the VT we expand to, the Hi and - // Lo of the result will be in the first 2 parts of the result after - // shifting right. This only requires shifting by the scale as far as the - // third part in the result (ResultHL). - SDValue SRLAmnt = DAG.getConstant(Scale, dl, ShiftTy); - SDValue SHLAmnt = DAG.getConstant(NVTSize - Scale, dl, ShiftTy); - Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLL, SRLAmnt); - Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, - DAG.getNode(ISD::SHL, dl, NVT, ResultLH, SHLAmnt)); - Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt); - Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, - DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt)); - } else if (Scale == NVTSize) { - // If the scales are equal, Lo and Hi are ResultLH and ResultHL, - // respectively. Avoid shifting to prevent undefined behavior. - Lo = ResultLH; - Hi = ResultHL; - } else if (Scale < VTSize) { - // If the scale is instead less than the old VT size, but greater than or - // equal to the expanded VT size, the first part of the result (ResultLL) is - // no longer a part of Lo because it would be scaled out anyway. Instead we - // can start shifting right from the fourth part (ResultHH) to the second - // part (ResultLH), and ResultLH will be the new Lo. 
- SDValue SRLAmnt = DAG.getConstant(Scale - NVTSize, dl, ShiftTy); - SDValue SHLAmnt = DAG.getConstant(VTSize - Scale, dl, ShiftTy); - Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt); - Lo = DAG.getNode(ISD::OR, dl, NVT, Lo, - DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt)); - Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt); - Hi = DAG.getNode(ISD::OR, dl, NVT, Hi, - DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt)); - } else if (Scale == VTSize) { - assert( - !Signed && - "Only unsigned types can have a scale equal to the operand bit width"); - - Lo = ResultHL; - Hi = ResultHH; - } else - llvm_unreachable("Expected the scale to be less than or equal to the width " - "of the operands"); + // The resulting Lo and Hi would normally be in LL and LH after the shift. But + // to avoid unnecessary shifting of all 4 parts, we can adjust the shift + // amount and get Lo and Hi using two funnel shifts. Or for the special case + // when Scale is a multiple of NVTSize we can just pick the result without + // shifting. + uint64_t Part0 = Scale / NVTSize; // Part holding lowest bit needed. + if (Scale % NVTSize) { + SDValue ShiftAmount = DAG.getConstant(Scale % NVTSize, dl, ShiftTy); + Lo = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 1], Result[Part0], + ShiftAmount); + Hi = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 2], Result[Part0 + 1], + ShiftAmount); + } else { + Lo = Result[Part0]; + Hi = Result[Part0 + 1]; + } // Unless saturation is requested we are done. The result is in <Hi,Lo>. if (!Saturating) @@ -2934,6 +2901,9 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo, // highest bit of HH determines saturation direction in the event of // saturation. 
+ SDValue ResultHL = Result[2]; + SDValue ResultHH = Result[3]; + SDValue SatMax, SatMin; SDValue NVTZero = DAG.getConstant(0, dl, NVT); SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT); diff --git a/test/CodeGen/RISCV/addcarry.ll b/test/CodeGen/RISCV/addcarry.ll index 15fe53e9f24..5a25fb98509 100644 --- a/test/CodeGen/RISCV/addcarry.ll +++ b/test/CodeGen/RISCV/addcarry.ll @@ -34,10 +34,10 @@ define i64 @addcarry(i64 %x, i64 %y) nounwind { ; RISCV32-NEXT: mul a0, a0, a2 ; RISCV32-NEXT: srli a0, a0, 2 ; RISCV32-NEXT: slli a1, a6, 30 -; RISCV32-NEXT: or a0, a0, a1 +; RISCV32-NEXT: or a0, a1, a0 ; RISCV32-NEXT: srli a1, a6, 2 ; RISCV32-NEXT: slli a2, a5, 30 -; RISCV32-NEXT: or a1, a1, a2 +; RISCV32-NEXT: or a1, a2, a1 ; RISCV32-NEXT: ret %tmp = call i64 @llvm.smul.fix.i64(i64 %x, i64 %y, i32 2); ret i64 %tmp; diff --git a/test/CodeGen/X86/smul_fix.ll b/test/CodeGen/X86/smul_fix.ll index 80ed91fee17..c34730b1125 100644 --- a/test/CodeGen/X86/smul_fix.ll +++ b/test/CodeGen/X86/smul_fix.ll @@ -374,26 +374,25 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebx, %esi +; X86-NEXT: cmovnsl %ebp, %esi ; X86-NEXT: cmovnsl %edx, %ecx ; X86-NEXT: 
movl %ecx, %edi ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi diff --git a/test/CodeGen/X86/smul_fix_sat.ll b/test/CodeGen/X86/smul_fix_sat.ll index d1f864c5731..284f51d7422 100644 --- a/test/CodeGen/X86/smul_fix_sat.ll +++ b/test/CodeGen/X86/smul_fix_sat.ll @@ -60,7 +60,6 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: subl $8, %esp ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %ecx, %eax @@ -69,64 +68,61 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %edx, %ebp ; X86-NEXT: addl %ebx, %ebp -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: adcl $0, %edi -; X86-NEXT: movl %ebx, %eax ; X86-NEXT: imull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: movl %eax, %esi -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, %esi +; X86-NEXT: movl %eax, %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: addl %esi, %edx -; X86-NEXT: adcl $0, %ecx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: sbbl $0, %edi -; X86-NEXT: testl %ebx, %ebx -; X86-NEXT: cmovnsl %ecx, %edi -; X86-NEXT: cmovnsl %edx, %esi -; X86-NEXT: movl %esi, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: addl %ebx, %edx +; X86-NEXT: adcl $0, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: subl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %esi, %ebx +; X86-NEXT: cmovnsl %edx, %edi ; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sbbl $0, %ebp 
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebx, %esi +; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X86-NEXT: cmovnsl %ebx, %esi ; X86-NEXT: cmovnsl %edi, %ebp -; X86-NEXT: cmovnsl %esi, %ecx -; X86-NEXT: testl %ebp, %ebp -; X86-NEXT: setg %bh -; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill -; X86-NEXT: cmpl $1, %ecx -; X86-NEXT: seta %bl -; X86-NEXT: movl %ecx, %edx -; X86-NEXT: shldl $30, %eax, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT: shldl $30, %esi, %eax -; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload -; X86-NEXT: orb %bh, %bl -; X86-NEXT: testb %bl, %bl -; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF -; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: movl $-1, %esi -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: cmpl $-1, %ebp -; X86-NEXT: setl %bl +; X86-NEXT: testl %esi, %esi +; X86-NEXT: setg %bl ; X86-NEXT: sete %bh -; X86-NEXT: cmpl $-2, %ecx -; X86-NEXT: setb %cl -; X86-NEXT: andb %bh, %cl -; X86-NEXT: xorl %esi, %esi -; X86-NEXT: orb %bl, %cl -; X86-NEXT: cmovnel %esi, %eax -; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 -; X86-NEXT: cmovnel %ecx, %edx -; X86-NEXT: addl $8, %esp +; X86-NEXT: cmpl $1, %ebp +; X86-NEXT: seta %dl +; X86-NEXT: andb %bh, %dl +; X86-NEXT: orb %bl, %dl +; X86-NEXT: shrdl $2, %eax, %ecx +; X86-NEXT: shrdl $2, %ebp, %eax +; X86-NEXT: testb %dl, %dl +; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF +; X86-NEXT: cmovel %eax, %edi +; X86-NEXT: movl $-1, %eax +; X86-NEXT: cmovnel %eax, %ecx +; X86-NEXT: cmpl $-1, %esi +; X86-NEXT: setl %al +; X86-NEXT: sete %dl +; X86-NEXT: cmpl $-2, %ebp +; X86-NEXT: setb %ah +; X86-NEXT: andb %dl, %ah +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: orb %al, %ah +; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000 +; X86-NEXT: cmovel %edi, %edx +; X86-NEXT: movl %ecx, %eax ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ 
-688,44 +684,42 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: addl %ebx, %ebp +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl %esi, %eax ; X86-NEXT: imull {{[0-9]+}}(%esp) -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ebp ; X86-NEXT: movl %eax, %ecx ; X86-NEXT: movl %esi, %eax ; X86-NEXT: mull {{[0-9]+}}(%esp) -; X86-NEXT: addl %ebp, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: addl %ecx, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ebp ; X86-NEXT: movl %edx, %ecx ; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ebx, %esi +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: sbbl $0, %esi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %ebx, %esi +; X86-NEXT: cmovnsl %ebp, %esi ; X86-NEXT: cmovnsl %edx, %ecx -; X86-NEXT: movl %ecx, %edi -; X86-NEXT: subl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: sbbl $0, %ebx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %esi, %edi +; X86-NEXT: sbbl $0, %edi ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X86-NEXT: cmovnsl %esi, %ebx -; X86-NEXT: cmovnsl %ecx, %edi -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl $1, %edi, %edx -; X86-NEXT: shrdl $31, %edi, %eax -; X86-NEXT: cmpl $1073741823, %ebx # imm = 0x3FFFFFFF +; X86-NEXT: cmovnsl %esi, %edi +; X86-NEXT: cmovnsl %ecx, %edx +; X86-NEXT: shrdl $31, %edx, %eax +; X86-NEXT: shrdl $31, %edi, %edx +; X86-NEXT: cmpl $1073741823, %edi # imm = 0x3FFFFFFF ; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF ; X86-NEXT: cmovgl %ecx, %edx ; X86-NEXT: movl $-1, %ecx ; X86-NEXT: cmovgl %ecx, %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: cmpl $-1073741824, %ebx # imm = 0xC0000000 +; X86-NEXT: cmpl $-1073741824, %edi # imm = 0xC0000000 ; X86-NEXT: cmovll %ecx, %eax ; 
X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000 ; X86-NEXT: cmovll %ecx, %edx diff --git a/test/CodeGen/X86/umul_fix.ll b/test/CodeGen/X86/umul_fix.ll index e4277deea14..8481fe4ac6b 100644 --- a/test/CodeGen/X86/umul_fix.ll +++ b/test/CodeGen/X86/umul_fix.ll @@ -60,9 +60,10 @@ define i64 @func2(i64 %x, i64 %y) nounwind { ; X86-NEXT: addl %ebp, %eax ; X86-NEXT: adcl %edi, %edx ; X86-NEXT: imull {{[0-9]+}}(%esp), %esi -; X86-NEXT: addl %esi, %edx -; X86-NEXT: shldl $30, %eax, %edx +; X86-NEXT: addl %edx, %esi +; X86-NEXT: shldl $30, %eax, %esi ; X86-NEXT: shldl $30, %ecx, %eax +; X86-NEXT: movl %esi, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -318,23 +319,22 @@ define i64 @func8(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl %eax, %ebx ; X86-NEXT: movl %ecx, %eax ; X86-NEXT: mull %esi -; X86-NEXT: movl %edx, %ecx -; X86-NEXT: addl %ebx, %ecx +; X86-NEXT: addl %edx, %ebx ; X86-NEXT: adcl $0, %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %ebp -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %edx, %ecx ; X86-NEXT: movl %eax, %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: mull %esi -; X86-NEXT: addl %ecx, %eax +; X86-NEXT: addl %ebx, %eax ; X86-NEXT: adcl %edi, %edx -; X86-NEXT: adcl $0, %ebx +; X86-NEXT: adcl $0, %ecx ; X86-NEXT: addl %ebp, %edx -; X86-NEXT: adcl $0, %ebx -; X86-NEXT: shldl $1, %edx, %ebx +; X86-NEXT: adcl $0, %ecx +; X86-NEXT: shldl $1, %edx, %ecx ; X86-NEXT: shrdl $31, %edx, %eax -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx -- 2.40.0