// Handle special case opcodes.
switch (Opc) {
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ: {
+   APInt LHSUndef, LHSZero;
+   APInt RHSUndef, RHSZero;
+   SDValue LHS = Op.getOperand(0);
+   SDValue RHS = Op.getOperand(1);
+   if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO,
+                                  Depth + 1))
+     return true;
+   if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO,
+                                  Depth + 1))
+     return true;
+   // Multiply by zero.
+   KnownZero = LHSZero | RHSZero;
+   break;
+ }
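The added case propagates known-zero elements through the multiply: an element that is zero in either multiplicand yields a zero product element, which is all that KnownZero = LHSZero | RHSZero records. A minimal standalone sketch of that rule (plain C++ bitmasks rather than the LLVM APInt/SDValue machinery; the helper name is made up for illustration):

// One bit per vector element, set when that element is known to be zero.
#include <cassert>
#include <cstdint>

static uint32_t knownZeroOfMul(uint32_t LHSZero, uint32_t RHSZero) {
  // A zero element in either operand forces the corresponding product
  // element to zero, exactly as in the case above.
  return LHSZero | RHSZero;
}

int main() {
  // LHS elements 0 and 2 are zero, RHS element 1 is zero: the product is
  // known zero in elements 0, 1 and 2.
  assert(knownZeroOfMul(0b0101u, 0b0010u) == 0b0111u);
  return 0;
}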
case X86ISD::VSHL:
case X86ISD::VSRL:
case X86ISD::VSRA: {
SDValue Insert =
    insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
return TLO.CombineTo(Op, Insert);
- }
+ }
+ // Arithmetic Ops.
+ case X86ISD::PMULDQ:
+ case X86ISD::PMULUDQ:
// Target Shuffles.
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
// FIXME: Can we bound this better?
APInt DemandedMask = APInt::getLowBitsSet(64, 32);
- if (SimplifyDemandedBits(LHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(LHS, DemandedMask, OriginalDemandedElts, KnownOp,
+                          TLO, Depth + 1))
return true;
- if (SimplifyDemandedBits(RHS, DemandedMask, KnownOp, TLO, Depth + 1))
+ if (SimplifyDemandedBits(RHS, DemandedMask, OriginalDemandedElts, KnownOp,
+                          TLO, Depth + 1))
return true;
break;
}
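The low-32-bit DemandedMask encodes the PMULDQ/PMULUDQ semantics: each 64-bit result element is the product of only the low 32 bits of the corresponding 64-bit source elements (sign-extended for PMULDQ, zero-extended for PMULUDQ), so nothing above bit 31 of either operand ever needs to be computed. A scalar, single-lane model of the unsigned form (plain C++, not the intrinsic; the helper name is invented):

#include <cassert>
#include <cstdint>

static uint64_t pmuludqLane(uint64_t A, uint64_t B) {
  // Only the low 32 bits of each 64-bit source element reach the multiplier,
  // matching APInt::getLowBitsSet(64, 32) above.
  return uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));
}

int main() {
  // Garbage in the upper 32 bits of either input cannot change the result.
  assert(pmuludqLane(0xDEADBEEF00000007ULL, 0xCAFEBABE00000009ULL) ==
         pmuludqLane(7, 9));
  return 0;
}

The regenerated checks below show the payoff in the vector multiply reduction tests: once only the low 128 bits of a wider value are demanded, the 256-bit and 512-bit vpmuludq/shift/add sequences shrink to their xmm forms.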
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX512BW-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BWVL-LABEL: test_v4i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
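The vpmuludq/vpsrlq/vpsllq/vpaddq groups in these checks are the standard expansion of a 64x64-bit element multiply on subtargets without a native 64-bit vector multiply: lo*lo plus both cross products shifted up by 32, with the hi*hi term dropped because it falls entirely outside the low 64 bits. A scalar sketch of that identity (one lane, wrap-around arithmetic):

#include <cassert>
#include <cstdint>

static uint64_t mul64ViaPmuludq(uint64_t A, uint64_t B) {
  uint64_t LoLo = uint64_t(uint32_t(A)) * uint64_t(uint32_t(B));       // vpmuludq
  uint64_t LoHi = uint64_t(uint32_t(A)) * uint64_t(uint32_t(B >> 32)); // vpsrlq + vpmuludq
  uint64_t HiLo = uint64_t(uint32_t(A >> 32)) * uint64_t(uint32_t(B)); // vpsrlq + vpmuludq
  // vpsllq + vpaddq; the hi(A)*hi(B) term vanishes mod 2^64.
  return LoLo + ((LoHi + HiLo) << 32);
}

int main() {
  uint64_t A = 0x123456789ABCDEF0ULL, B = 0x0FEDCBA987654321ULL;
  assert(mul64ViaPmuludq(A, B) == A * B); // both sides wrap mod 2^64
  return 0;
}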
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
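The surrounding code in each check block is the tail of a binary reduction tree: vextracti128 folds the upper 128-bit half into the lower half, and the vpshufd xmm1 = xmm0[2,3,0,1] step then pairs the last two 64-bit lanes so the final multiply leaves the result in lane 0 for vmovq. A scalar sketch of those last two halving steps for four remaining i64 lanes (wrap-around multiplies, as in the vector code; the helper is illustrative only):

#include <cassert>
#include <cstdint>

static uint64_t reduceMulTail(const uint64_t V[4]) {
  uint64_t Lane0 = V[0] * V[2]; // vextracti128 $1 + 64-bit multiply expansion
  uint64_t Lane1 = V[1] * V[3];
  return Lane0 * Lane1;         // vpshufd [2,3,0,1] + final multiply, then vmovq
}

int main() {
  const uint64_t V[4] = {3, 5, 7, 11};
  assert(reduceMulTail(V) == 3ULL * 5 * 7 * 11);
  return 0;
}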
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX512BW-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BWVL-LABEL: test_v4i64:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3
-; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vmovq %xmm0, %rax
; AVX2-NEXT: vzeroupper
; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT: vmovq %xmm0, %rax
; AVX512BW-NEXT: vzeroupper
; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
+; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
+; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
+; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
+; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovq %xmm0, %rax
; AVX512BWVL-NEXT: vzeroupper