"the size of the current value type");
EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
- SDValue ResultLL = Result[0];
- SDValue ResultLH = Result[1];
- SDValue ResultHL = Result[2];
- SDValue ResultHH = Result[3];
-
// After getting the multiplication result in 4 parts, we need to perform a
// shift right by the amount of the scale to get the result in that scale.
//
//
// |NVTSize-|
//
- // The resulting Lo and Hi will only need to be one of these 32-bit parts
- // after shifting.
- if (Scale < NVTSize) {
- // If the scale is less than the size of the VT we expand to, the Hi and
- // Lo of the result will be in the first 2 parts of the result after
- // shifting right. This only requires shifting by the scale as far as the
- // third part in the result (ResultHL).
- SDValue SRLAmnt = DAG.getConstant(Scale, dl, ShiftTy);
- SDValue SHLAmnt = DAG.getConstant(NVTSize - Scale, dl, ShiftTy);
- Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLL, SRLAmnt);
- Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
- DAG.getNode(ISD::SHL, dl, NVT, ResultLH, SHLAmnt));
- Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
- Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
- DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
- } else if (Scale == NVTSize) {
- // If the scales are equal, Lo and Hi are ResultLH and ResultHL,
- // respectively. Avoid shifting to prevent undefined behavior.
- Lo = ResultLH;
- Hi = ResultHL;
- } else if (Scale < VTSize) {
- // If the scale is instead less than the old VT size, but greater than or
- // equal to the expanded VT size, the first part of the result (ResultLL) is
- // no longer a part of Lo because it would be scaled out anyway. Instead we
- // can start shifting right from the fourth part (ResultHH) to the second
- // part (ResultLH), and ResultLH will be the new Lo.
- SDValue SRLAmnt = DAG.getConstant(Scale - NVTSize, dl, ShiftTy);
- SDValue SHLAmnt = DAG.getConstant(VTSize - Scale, dl, ShiftTy);
- Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
- Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
- DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
- Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt);
- Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
- DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt));
- } else if (Scale == VTSize) {
- assert(
- !Signed &&
- "Only unsigned types can have a scale equal to the operand bit width");
-
- Lo = ResultHL;
- Hi = ResultHH;
- } else
- llvm_unreachable("Expected the scale to be less than or equal to the width "
- "of the operands");
+ // The resulting Lo and Hi would normally be in LL and LH after the shift. But
+ // to avoid unnecessary shifting of all 4 parts, we can adjust the shift
+ // amount and get Lo and Hi using two funnel shifts. Or for the special case
+ // when Scale is a multiple of NVTSize we can just pick the result without
+ // shifting.
+ uint64_t Part0 = Scale / NVTSize; // Part holding lowest bit needed.
+ if (Scale % NVTSize) {
+ SDValue ShiftAmount = DAG.getConstant(Scale % NVTSize, dl, ShiftTy);
+ Lo = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 1], Result[Part0],
+ ShiftAmount);
+ Hi = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 2], Result[Part0 + 1],
+ ShiftAmount);
+ } else {
+ Lo = Result[Part0];
+ Hi = Result[Part0 + 1];
+ }
// Unless saturation is requested we are done. The result is in <Hi,Lo>.
if (!Saturating)
// highest bit of HH determines saturation direction in the event of
// saturation.
+ SDValue ResultHL = Result[2];
+ SDValue ResultHH = Result[3];
+
SDValue SatMax, SatMin;
SDValue NVTZero = DAG.getConstant(0, dl, NVT);
SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT);
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %ebx, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %ebx, %eax
; X86-NEXT: imull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: sbbl $0, %edi
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnsl %ecx, %edi
-; X86-NEXT: cmovnsl %edx, %esi
-; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: addl %ebx, %edx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnsl %esi, %ebx
+; X86-NEXT: cmovnsl %edx, %edi
; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: sbbl $0, %ebp
+; X86-NEXT: subl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: sbbl $0, %esi
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: cmovnsl %ebx, %esi
; X86-NEXT: cmovnsl %edi, %ebp
-; X86-NEXT: cmovnsl %esi, %ecx
-; X86-NEXT: testl %ebp, %ebp
-; X86-NEXT: setg %bh
-; X86-NEXT: sete {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: cmpl $1, %ecx
-; X86-NEXT: seta %bl
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shldl $30, %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $30, %esi, %eax
-; X86-NEXT: andb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
-; X86-NEXT: orb %bh, %bl
-; X86-NEXT: testb %bl, %bl
-; X86-NEXT: movl $2147483647, %esi # imm = 0x7FFFFFFF
-; X86-NEXT: cmovnel %esi, %edx
-; X86-NEXT: movl $-1, %esi
-; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: cmpl $-1, %ebp
-; X86-NEXT: setl %bl
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: setg %bl
; X86-NEXT: sete %bh
-; X86-NEXT: cmpl $-2, %ecx
-; X86-NEXT: setb %cl
-; X86-NEXT: andb %bh, %cl
-; X86-NEXT: xorl %esi, %esi
-; X86-NEXT: orb %bl, %cl
-; X86-NEXT: cmovnel %esi, %eax
-; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: addl $8, %esp
+; X86-NEXT: cmpl $1, %ebp
+; X86-NEXT: seta %dl
+; X86-NEXT: andb %bh, %dl
+; X86-NEXT: orb %bl, %dl
+; X86-NEXT: shrdl $2, %eax, %ecx
+; X86-NEXT: shrdl $2, %ebp, %eax
+; X86-NEXT: testb %dl, %dl
+; X86-NEXT: movl $2147483647, %edi # imm = 0x7FFFFFFF
+; X86-NEXT: cmovel %eax, %edi
+; X86-NEXT: movl $-1, %eax
+; X86-NEXT: cmovnel %eax, %ecx
+; X86-NEXT: cmpl $-1, %esi
+; X86-NEXT: setl %al
+; X86-NEXT: sete %dl
+; X86-NEXT: cmpl $-2, %ebp
+; X86-NEXT: setb %ah
+; X86-NEXT: andb %dl, %ah
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: orb %al, %ah
+; X86-NEXT: cmovnel %edx, %ecx
+; X86-NEXT: movl $-2147483648, %edx # imm = 0x80000000
+; X86-NEXT: cmovel %edi, %edx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: imull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %ebp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl %ebp, %esi
; X86-NEXT: sbbl $0, %esi
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %ebx, %esi
+; X86-NEXT: cmovnsl %ebp, %esi
; X86-NEXT: cmovnsl %edx, %ecx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: sbbl $0, %ebx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: subl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: sbbl $0, %edi
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %esi, %ebx
-; X86-NEXT: cmovnsl %ecx, %edi
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: shldl $1, %edi, %edx
-; X86-NEXT: shrdl $31, %edi, %eax
-; X86-NEXT: cmpl $1073741823, %ebx # imm = 0x3FFFFFFF
+; X86-NEXT: cmovnsl %esi, %edi
+; X86-NEXT: cmovnsl %ecx, %edx
+; X86-NEXT: shrdl $31, %edx, %eax
+; X86-NEXT: shrdl $31, %edi, %edx
+; X86-NEXT: cmpl $1073741823, %edi # imm = 0x3FFFFFFF
; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
; X86-NEXT: cmovgl %ecx, %edx
; X86-NEXT: movl $-1, %ecx
; X86-NEXT: cmovgl %ecx, %eax
; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: cmpl $-1073741824, %ebx # imm = 0xC0000000
+; X86-NEXT: cmpl $-1073741824, %edi # imm = 0xC0000000
; X86-NEXT: cmovll %ecx, %eax
; X86-NEXT: movl $-2147483648, %ecx # imm = 0x80000000
; X86-NEXT: cmovll %ecx, %edx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: adcl %edi, %edx
; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: shldl $30, %eax, %edx
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: shldl $30, %eax, %esi
; X86-NEXT: shldl $30, %ecx, %eax
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: addl %edx, %ebx
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %esi
-; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: shldl $1, %edx, %ebx
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: shldl $1, %edx, %ecx
; X86-NEXT: shrdl $31, %edx, %eax
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %ecx, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx