From: Craig Topper
Date: Fri, 28 Dec 2018 19:19:39 +0000 (+0000)
Subject: [X86] Directly emit X86ISD::PMULUDQ from the ReplaceNodeResults handling of v2i8...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=52b836d86e6ea2a7a3a7f5106cf00552ac951521;p=llvm

[X86] Directly emit X86ISD::PMULUDQ from the ReplaceNodeResults handling of v2i8/v2i16/v2i32 multiply.

Previously we emitted a multiply and some masking that was supposed to be
matched to PMULUDQ, but the masking could sometimes be removed before we got
a chance to match it. So instead just emit the PMULUDQ directly.

Remove the DAG combine that was added when the ReplaceNodeResults code was
originally introduced.

Add a new DAG combine to avoid regressions in shrink_vmul.ll.

Some of the shrink_vmul.ll test cases now pick PMULUDQ instead of
PMADDWD/PMULLD, but I think this should be an improvement on most CPUs.

I think all of this can go away if/when we switch to
-x86-experimental-vector-widening-legalization.
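For illustration, a bare <2 x i32> multiply is enough to reach this
ReplaceNodeResults path (a minimal reproducer sketch, assuming the default
promoting type legalization; it is not one of the checked-in tests):

  define <2 x i32> @mul_v2i32(<2 x i32> %a, <2 x i32> %b) {
    %m = mul <2 x i32> %a, %b   ; previously any_extend + AND + ISD::MUL,
    ret <2 x i32> %m            ; now lowered directly to X86ISD::PMULUDQ
  }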
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350134 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 231e20cc02f..82083653f0d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -26246,13 +26246,9 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     // Promote to a pattern that will be turned into PMULUDQ.
     SDValue N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
                              N->getOperand(0));
-    N0 = DAG.getNode(ISD::AND, dl, MVT::v2i64, N0,
-                     DAG.getConstant(0xffffffff, dl, MVT::v2i64));
     SDValue N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v2i64,
                              N->getOperand(1));
-    N1 = DAG.getNode(ISD::AND, dl, MVT::v2i64, N1,
-                     DAG.getConstant(0xffffffff, dl, MVT::v2i64));
-    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v2i64, N0, N1);
+    SDValue Mul = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, N0, N1);
     Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, VT, Mul));
   } else if (getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
              VT.getVectorElementType() == MVT::i8) {
@@ -32250,6 +32246,52 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     return SDValue(N, 0);
   }
 
+  // Look for a truncating shuffle to v2i32 of a PMULUDQ where one of the
+  // operands is an extend from v2i32 to v2i64. Turn it into a pmulld.
+  // FIXME: This can probably go away once we default to widening legalization.
+  if (Subtarget.hasSSE41() && VT == MVT::v4i32 &&
+      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
+      N->getOperand(0).getOpcode() == ISD::BITCAST &&
+      N->getOperand(0).getOperand(0).getOpcode() == X86ISD::PMULUDQ) {
+    SDValue BC = N->getOperand(0);
+    SDValue MULUDQ = BC.getOperand(0);
+    ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+    ArrayRef<int> Mask = SVOp->getMask();
+    if (BC.hasOneUse() && MULUDQ.hasOneUse() &&
+        Mask[0] == 0 && Mask[1] == 2 && Mask[2] == -1 && Mask[3] == -1) {
+      SDValue Op0 = MULUDQ.getOperand(0);
+      SDValue Op1 = MULUDQ.getOperand(1);
+      if (Op0.getOpcode() == ISD::BITCAST &&
+          Op0.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
+          Op0.getOperand(0).getValueType() == MVT::v4i32) {
+        ShuffleVectorSDNode *SVOp0 =
+            cast<ShuffleVectorSDNode>(Op0.getOperand(0));
+        ArrayRef<int> Mask2 = SVOp0->getMask();
+        if (Mask2[0] == 0 && Mask2[1] == -1 &&
+            Mask2[2] == 1 && Mask2[3] == -1) {
+          Op0 = SVOp0->getOperand(0);
+          Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+          Op1 = DAG.getVectorShuffle(MVT::v4i32, dl, Op1, Op1, Mask);
+          return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
+        }
+      }
+      if (Op1.getOpcode() == ISD::BITCAST &&
+          Op1.getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
+          Op1.getOperand(0).getValueType() == MVT::v4i32) {
+        ShuffleVectorSDNode *SVOp1 =
+            cast<ShuffleVectorSDNode>(Op1.getOperand(0));
+        ArrayRef<int> Mask2 = SVOp1->getMask();
+        if (Mask2[0] == 0 && Mask2[1] == -1 &&
+            Mask2[2] == 1 && Mask2[3] == -1) {
+          Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+          Op0 = DAG.getVectorShuffle(MVT::v4i32, dl, Op0, Op0, Mask);
+          Op1 = SVOp1->getOperand(0);
+          return DAG.getNode(ISD::MUL, dl, MVT::v4i32, Op0, Op1);
+        }
+      }
+    }
+  }
+
   return SDValue();
 }
@@ -35107,26 +35149,6 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
   EVT VT = N->getValueType(0);
 
-  // Look for multiply of 2 identical shuffles with a zero vector. Shuffle the
-  // result and insert the zero there instead. This can occur due to
-  // type legalization of v2i32 multiply to a PMULUDQ pattern.
-  SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
-  if (!DCI.isBeforeLegalize() && isa<ShuffleVectorSDNode>(LHS) &&
-      isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
-      LHS.getOperand(1) == RHS.getOperand(1) &&
-      ISD::isBuildVectorAllZeros(LHS.getOperand(1).getNode())) {
-    ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
-    ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
-    if (SVN0->getMask().equals(SVN1->getMask())) {
-      SDLoc dl(N);
-      SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, LHS.getOperand(0),
-                                RHS.getOperand(0));
-      return DAG.getVectorShuffle(VT, dl, Mul, DAG.getConstant(0, dl, VT),
-                                  SVN0->getMask());
-    }
-  }
-
   if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
     return V;

diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index 97290730d90..b2fa8192abf 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -44,7 +44,7 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
@@ -70,7 +70,7 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
@@ -916,7 +916,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovsxbq (%edx,%ecx), %xmm0
 ; X86-AVX-NEXT:    vpmovsxbq (%eax,%ecx), %xmm1
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
@@ -944,7 +944,7 @@ define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b,
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rdx), %xmm0
 ; X64-AVX-NEXT:    vpmovsxbq (%rsi,%rdx), %xmm1
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
@@ -1004,7 +1004,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovsxbq (%edx,%ecx), %xmm0
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
@@ -1033,7 +1033,7 @@ define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonl
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rdx), %xmm0
 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
@@ -1087,7 +1087,7 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
 ; X86-AVX-NEXT:    movl c, %esi
 ; X86-AVX-NEXT:    vpmovsxwq (%edx,%ecx), %xmm0
 ; X86-AVX-NEXT:    vpmovsxwq (%eax,%ecx), %xmm1
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X86-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
@@ -1110,7 +1110,7 @@ define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rdx), %xmm0
 ; X64-AVX-NEXT:    vpmovsxwq (%rsi,%rdx), %xmm1
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
@@ -1169,9 +1169,8 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X86-AVX-NEXT:    vpmovsxwq (%edx,%ecx), %xmm0
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
 ; X86-AVX-NEXT:    popl %esi
 ; X86-AVX-NEXT:    retl
@@ -1198,9 +1197,8 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rdx), %xmm0
 ; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
 ; X64-AVX-NEXT:    retq
 entry:
@@ -1406,7 +1404,7 @@ define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1430,7 +1428,7 @@ define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-NEXT:    movl $255, %ecx
 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -1474,7 +1472,7 @@ define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1496,7 +1494,7 @@ define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -1542,7 +1540,7 @@ define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1569,7 +1567,7 @@ define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-NEXT:    movl $256, %ecx # imm = 0x100
 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -1615,7 +1613,7 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1639,7 +1637,7 @@ define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -1685,7 +1683,7 @@ define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1709,7 +1707,7 @@ define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -1755,7 +1753,7 @@ define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1779,7 +1777,7 @@ define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -1823,9 +1821,7 @@ define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
@@ -1846,12 +1842,11 @@ define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; X64-AVX-NEXT:    movl $65535, %ecx # imm = 0xFFFF
 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
 entry:
@@ -1893,7 +1888,7 @@ define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovsxwq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -1914,7 +1909,7 @@ define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX:       # %bb.0: # %entry
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rsi), %xmm0
-; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmuludq {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
@@ -1958,9 +1953,7 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
 ;
@@ -1984,12 +1977,11 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
 ; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; X64-AVX-NEXT:    movl $65536, %ecx # imm = 0x10000
 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
 ; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq
 entry:
@@ -2031,7 +2023,7 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-AVX-NEXT:    movl c, %edx
 ; X86-AVX-NEXT:    vpmovsxwq (%ecx,%eax), %xmm0
-; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
+; X86-AVX-NEXT:    vpmuludq {{\.LCPI.*}}, %xmm0, %xmm0
 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
 ; X86-AVX-NEXT:    retl
@@ -2058,7 +2050,7 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; X64-AVX-NEXT:    movl $32768, %ecx # imm = 0x8000
 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
-; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
 ; X64-AVX-NEXT:    retq

diff --git a/test/CodeGen/X86/vector-reduce-mul.ll b/test/CodeGen/X86/vector-reduce-mul.ll
index ea80a5be2a6..9d010973a85 100644
--- a/test/CodeGen/X86/vector-reduce-mul.ll
+++ b/test/CodeGen/X86/vector-reduce-mul.ll
@@ -790,35 +790,12 @@ define i32 @test_v2i32(<2 x i32> %a0) {
 ; AVX-NEXT:    vmovd %xmm0, %eax
 ; AVX-NEXT:    retq
 ;
-; AVX512BW-LABEL: test_v2i32:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovd %xmm0, %eax
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: test_v2i32:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512DQ-LABEL: test_v2i32:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovd %xmm0, %eax
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512DQVL-LABEL: test_v2i32:
-; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
-; AVX512DQVL-NEXT:    retq
+; AVX512-LABEL: test_v2i32:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    retq
   %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v2i32(<2 x i32> %a0)
   ret i32 %1
 }
@@ -1156,39 +1133,13 @@ define i16 @test_v2i16(<2 x i16> %a0) {
 ; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
 ; AVX-NEXT:    retq
 ;
-; AVX512BW-LABEL: test_v2i16:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vmovd %xmm0, %eax
-; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: test_v2i16:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
-; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512DQ-LABEL: test_v2i16:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vmovd %xmm0, %eax
-; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512DQVL-LABEL: test_v2i16:
-; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
-; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
-; AVX512DQVL-NEXT:    retq
+; AVX512-LABEL: test_v2i16:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
+; AVX512-NEXT:    retq
   %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v2i16(<2 x i16> %a0)
   ret i16 %1
 }
@@ -1634,39 +1585,13 @@ define i8 @test_v2i8(<2 x i8> %a0) {
 ; AVX-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX-NEXT:    retq
 ;
-; AVX512BW-LABEL: test_v2i8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT:    retq
-;
-; AVX512BWVL-LABEL: test_v2i8:
-; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512BWVL-NEXT:    retq
-;
-; AVX512DQ-LABEL: test_v2i8:
-; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512DQ-NEXT:    vzeroupper
-; AVX512DQ-NEXT:    retq
-;
-; AVX512DQVL-LABEL: test_v2i8:
-; AVX512DQVL:       # %bb.0:
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
-; AVX512DQVL-NEXT:    retq
+; AVX512-LABEL: test_v2i8:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX512-NEXT:    # kill: def $al killed $al killed $eax
+; AVX512-NEXT:    retq
   %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v2i8(<2 x i8> %a0)
   ret i8 %1
 }
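For reference, a distilled form of the shrink_vmul.ll pattern that the new
combineShuffle code targets (a hypothetical function, not one of the
checked-in tests; typed pointers as used by the 2018-era test files):

  define void @mul_2xi16_zext(i16* %src, i32* %dst) {
  entry:
    %p = bitcast i16* %src to <2 x i16>*
    %v = load <2 x i16>, <2 x i16>* %p, align 2
    %z = zext <2 x i16> %v to <2 x i32>           ; one PMULUDQ operand is an
    %m = mul <2 x i32> %z, <i32 65535, i32 65535> ; extend from v2i32, so the
    %q = bitcast i32* %dst to <2 x i32>*          ; truncating shuffle of the
    store <2 x i32> %m, <2 x i32>* %q, align 4    ; PMULUDQ result folds away
    ret void
  }

On an SSE4.1-capable target this should now select a single pmulld for the
multiply instead of pmuludq plus an extra pshufd, matching the updated
mul_2xi16_varconst1 checks above.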