// Try to turn a build vector of zero extends of extract vector elts into a
// vector zero extend and possibly an extract subvector.
-// TODO: Support sign extend or any extend?
+// TODO: Support sign extend?
// TODO: Allow undef elements?
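+// For example (an illustrative sketch, not text from the sources):
+//   (build_vector (zext (extract_elt V, 0)), (aext (extract_elt V, 1)))
+// folds to a vector ZERO_EXTEND of V (or of an extract_subvector of V),
+// since zero-extending the any-extended lane only refines undefined bits.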
SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
if (LegalOperations)
  return SDValue();

EVT VT = N->getValueType(0);
+ bool FoundZeroExtend = false;
SDValue Op0 = N->getOperand(0);
auto checkElem = [&](SDValue Op) -> int64_t {
- if (Op.getOpcode() == ISD::ZERO_EXTEND &&
+ unsigned Opc = Op.getOpcode();
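+ // Track whether any lane is a true zero_extend; if none is, an ANY_EXTEND
+ // of the source vector is sufficient below.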
+ FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND);
+ if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
  return C->getZExtValue();
return -1;
};
SDLoc DL(N);
In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
Op0.getOperand(0).getOperand(1));
- return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, In);
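+ // Emit ZERO_EXTEND only if some lane required it; a pure any-extend chain
+ // may stay an ANY_EXTEND, leaving the high bits undefined for the target.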
+ return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL,
+ VT, In);
}
bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &OriginalDemandedBits,
Known.Zero &= Known2.Zero;
}
return false; // Don't fall through, will infinitely loop.
+ case ISD::INSERT_VECTOR_ELT: {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Scl = Op.getOperand(1);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ EVT VecVT = Vec.getValueType();
+
+ // If index isn't constant, assume we need all vector elements AND the
+ // inserted element.
+ APInt DemandedVecElts(OriginalDemandedElts);
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) {
+ unsigned Idx = CIdx->getZExtValue();
+ DemandedVecElts.clearBit(Idx);
+
+ // Inserted element is not required.
+ if (!OriginalDemandedElts[Idx])
+ return TLO.CombineTo(Op, Vec);
+ }
+
+ KnownBits KnownScl;
+ unsigned NumSclBits = Scl.getScalarValueSizeInBits();
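+ // The scalar operand may be wider than the vector element type; bits above
+ // the element width are implicitly truncated and thus never demanded.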
+ APInt DemandedSclBits = OriginalDemandedBits.zextOrTrunc(NumSclBits);
+ if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
+ return true;
+
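+ // Known bits of the inserted lane, resized to the element bit width.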
+ Known = KnownScl.zextOrTrunc(BitWidth, false);
+
+ KnownBits KnownVec;
+ if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts,
+ KnownVec, TLO, Depth + 1))
+ return true;
+
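+ // If lanes other than the inserted one are demanded, only bits on which
+ // the scalar and the vector agree remain known.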
+ if (!!DemandedVecElts) {
+ Known.One &= KnownVec.One;
+ Known.Zero &= KnownVec.Zero;
+ }
+
+ return false;
+ }
case ISD::CONCAT_VECTORS: {
Known.Zero.setAllBits();
Known.One.setAllBits();
; X32: # %bb.0:
; X32-NEXT: pushl %eax
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: shrdl $30, %ecx, %eax
+; X32-NEXT: movl %eax, %ecx
; X32-NEXT: sarl $30, %ecx
+; X32-NEXT: shll $2, %eax
; X32-NEXT: vmovd %eax, %xmm0
; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; X32-NEXT: vpsrlq $3, %xmm0, %xmm0
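; Note: the second stack load and the shrdl funnel shift are gone; the
; demanded-bits support for insert_vector_elt lets both shifts be formed
; from a single loaded register.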
; %ext0 = zext <2 x i32> %0 to <2 x i64>
; %ext1 = zext <2 x i32> %1 to <2 x i64>
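; Note: pmuludq only reads the low 32 bits of each 64-bit lane, so the
; demanded-bits improvement proves the former SSE2-only pand masks
; redundant and SSE2/SSE42 now share a single SSE prefix.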
define <2 x i64> @_mul2xi64toi64a(<2 x i64>, <2 x i64>) {
-; SSE2-LABEL: _mul2xi64toi64a:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: _mul2xi64toi64a:
-; SSE42: # %bb.0:
-; SSE42-NEXT: pmuludq %xmm1, %xmm0
-; SSE42-NEXT: retq
+; SSE-LABEL: _mul2xi64toi64a:
+; SSE: # %bb.0:
+; SSE-NEXT: pmuludq %xmm1, %xmm0
+; SSE-NEXT: retq
;
; AVX-LABEL: _mul2xi64toi64a:
; AVX: # %bb.0:
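; In the tests below only the low 8/16 bits of the sign-extended scalar are
; demanded by the broadcast, so the sign-extending loads relax to zero
; extensions (movs -> movz).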
define <16 x i8> @insert_dup_mem_v16i8_sext_i8(i8* %ptr) {
; SSE2-LABEL: insert_dup_mem_v16i8_sext_i8:
; SSE2: # %bb.0:
-; SSE2-NEXT: movsbl (%rdi), %eax
+; SSE2-NEXT: movzbl (%rdi), %eax
; SSE2-NEXT: movd %eax, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
;
; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8:
; SSSE3: # %bb.0:
-; SSSE3-NEXT: movsbl (%rdi), %eax
+; SSSE3-NEXT: movzbl (%rdi), %eax
; SSSE3-NEXT: movd %eax, %xmm0
; SSSE3-NEXT: pxor %xmm1, %xmm1
; SSSE3-NEXT: pshufb %xmm1, %xmm0
;
; SSE41-LABEL: insert_dup_mem_v16i8_sext_i8:
; SSE41: # %bb.0:
-; SSE41-NEXT: movsbl (%rdi), %eax
+; SSE41-NEXT: movzbl (%rdi), %eax
; SSE41-NEXT: movd %eax, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pshufb %xmm1, %xmm0
;
; AVX1-LABEL: insert_dup_mem_v16i8_sext_i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: movsbl (%rdi), %eax
+; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
define <8 x i16> @insert_dup_mem_v8i16_sext_i16(i16* %ptr) {
; SSE-LABEL: insert_dup_mem_v8i16_sext_i16:
; SSE: # %bb.0:
-; SSE-NEXT: movswl (%rdi), %eax
+; SSE-NEXT: movzwl (%rdi), %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
;
; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
;
; AVX2-LABEL: insert_dup_mem_v8i16_sext_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: movzwl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_mem_v8i16_sext_i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
+; AVX512VL-NEXT: movzwl (%rdi), %eax
; AVX512VL-NEXT: vpbroadcastw %eax, %xmm0
; AVX512VL-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
define <16 x i16> @insert_dup_mem_v16i16_sext_i16(i16* %ptr) {
; AVX1-LABEL: insert_dup_mem_v16i16_sext_i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: movswl (%rdi), %eax
+; AVX1-NEXT: movzwl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
;
; AVX2-LABEL: insert_dup_mem_v16i16_sext_i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: movswl (%rdi), %eax
+; AVX2-NEXT: movzwl (%rdi), %eax
; AVX2-NEXT: vmovd %eax, %xmm0
; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512VL-LABEL: insert_dup_mem_v16i16_sext_i16:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movswl (%rdi), %eax
+; AVX512VL-NEXT: movzwl (%rdi), %eax
; AVX512VL-NEXT: vpbroadcastw %eax, %ymm0
; AVX512VL-NEXT: retq
%tmp = load i16, i16* %ptr, align 2
define <32 x i8> @insert_dup_mem_v32i8_sext_i8(i8* %ptr) {
; AVX1-LABEL: insert_dup_mem_v32i8_sext_i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: movsbl (%rdi), %eax
+; AVX1-NEXT: movzbl (%rdi), %eax
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) {
; KNL-LABEL: insert_dup_mem_v32i16_sext_i16:
; KNL: ## %bb.0:
-; KNL-NEXT: movswl (%rdi), %eax
+; KNL-NEXT: movzwl (%rdi), %eax
; KNL-NEXT: vmovd %eax, %xmm0
; KNL-NEXT: vpbroadcastw %xmm0, %ymm0
; KNL-NEXT: vmovdqa %ymm0, %ymm1
;
; SKX-LABEL: insert_dup_mem_v32i16_sext_i16:
; SKX: ## %bb.0:
-; SKX-NEXT: movswl (%rdi), %eax
+; SKX-NEXT: movzwl (%rdi), %eax
; SKX-NEXT: vpbroadcastw %eax, %zmm0
; SKX-NEXT: retq
%tmp = load i16, i16* %ptr, align 2