SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
unsigned HiOp);
SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+ SDValue CombineExtLoad(SDNode *N);
SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
SDValue BuildSDIV(SDNode *N);
SDValue BuildSDIVPow2(SDNode *N);
}
}
+// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
+SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND) &&
+ "Unexpected node type (not an extend)!");
+
+ // fold (sext (load x)) to multiple smaller sextloads; same for zext.
+ // For example, on a target with legal v4i32, but illegal v8i32, turn:
+ // (v8i32 (sext (v8i16 (load x))))
+ // into:
+ // (v8i32 (concat_vectors (v4i32 (sextload x)),
+ // (v4i32 (sextload (x + 16)))))
+ // Where uses of the original load, i.e.:
+ // (v8i16 (load x))
+ // are replaced with:
+ // (v8i16 (truncate
+ // (v8i32 (concat_vectors (v4i32 (sextload x)),
+ // (v4i32 (sextload (x + 16)))))))
+ //
+ // This combine is only applicable to illegal, but splittable, vectors.
+ // All legal types, and illegal non-vector types, are handled elsewhere.
+ // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
+ //
+ if (N0->getOpcode() != ISD::LOAD)
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+ if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
+ !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
+ !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+ return SDValue();
+
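+  // Make sure the remaining uses of the load can also be rewritten in terms
+  // of the extended value, collecting any setcc users that must be updated.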
+ SmallVector<SDNode *, 4> SetCCs;
+ if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
+ return SDValue();
+
+ ISD::LoadExtType ExtType =
+ N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+
+ // Try to split the vector types to get down to legal types.
+ EVT SplitSrcVT = SrcVT;
+ EVT SplitDstVT = DstVT;
+ while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+ SplitSrcVT.getVectorNumElements() > 1) {
+ SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+ SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+ }
+
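+  // Bail out if even the narrowest split is not a legal or custom extload.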
+ if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
+ return SDValue();
+
+ SDLoc DL(N);
+ const unsigned NumSplits =
+ DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+ const unsigned Stride = SplitSrcVT.getStoreSize();
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> Chains;
+
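+  // Emit NumSplits narrower extending loads, each reading SplitSrcVT and
+  // extending to SplitDstVT, advancing the pointer by the source chunk size.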
+ SDValue BasePtr = LN0->getBasePtr();
+ for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+ const unsigned Offset = Idx * Stride;
+ const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
+
+ SDValue SplitLoad = DAG.getExtLoad(
+ ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
+ LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT,
+ LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(),
+ Align, LN0->getAAInfo());
+
+ BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(Stride, BasePtr.getValueType()));
+
+ Loads.push_back(SplitLoad.getValue(0));
+ Chains.push_back(SplitLoad.getValue(1));
+ }
+
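+  // Merge the chains of the split loads and concatenate the extended values
+  // back into the original wide destination type.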
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+
+ CombineTo(N, NewValue);
+
+  // Replace uses of the original load (before extension)
+  // with a truncate of the concatenated sextloaded/zextloaded vectors.
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
+ CombineTo(N0.getNode(), Trunc, NewChain);
+ ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
+ (ISD::NodeType)N->getOpcode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
}
// fold (sext (load x)) -> (sext (truncate (sextload x)))
- // None of the supported targets knows how to perform load and sign extend
- // on vectors in one instruction. We only perform this transformation on
- // scalars.
- if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
- ISD::isUNINDEXEDLoad(N0.getNode()) &&
- ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ // Only generate vector extloads when 1) they're legal, and 2) they are
+ // deemed desirable by the target.
+ if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ ((!LegalOperations && !VT.isVector() &&
+ !cast<LoadSDNode>(N0)->isVolatile()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse())
DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
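+    // For vector extloads, additionally require the target to deem the
+    // wide extload desirable (TargetLowering::isVectorLoadExtDesirable).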
+ if (VT.isVector())
+ DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
}
}
+ // fold (sext (load x)) to multiple smaller sextloads.
+ // Only on illegal but splittable vectors.
+ if (SDValue ExtLoad = CombineExtLoad(N))
+ return ExtLoad;
+
// fold (sext (sextload x)) -> (sext (truncate (sextload x)))
// fold (sext ( extload x)) -> (sext (truncate (sextload x)))
if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
}
// fold (zext (load x)) -> (zext (truncate (zextload x)))
- // None of the supported targets knows how to perform load and vector_zext
- // on vectors in one instruction. We only perform this transformation on
- // scalars.
- if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
- ISD::isUNINDEXEDLoad(N0.getNode()) &&
- ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ // Only generate vector extloads when 1) they're legal, and 2) they are
+ // deemed desirable by the target.
+ if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ ((!LegalOperations && !VT.isVector() &&
+ !cast<LoadSDNode>(N0)->isVolatile()) ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
bool DoXform = true;
SmallVector<SDNode*, 4> SetCCs;
if (!N0.hasOneUse())
DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+ if (VT.isVector())
+ DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
if (DoXform) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
}
}
+ // fold (zext (load x)) to multiple smaller zextloads.
+ // Only on illegal but splittable vectors.
+ if (SDValue ExtLoad = CombineExtLoad(N))
+ return ExtLoad;
+
// fold (zext (and/or/xor (load x), cst)) ->
// (and/or/xor (zextload x), (zext cst))
if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
; SSE2-LABEL: sext_16i8_to_16i16:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movq (%rdi), %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw $8, %xmm0
; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: movq 8(%rdi), %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: psraw $8, %xmm1
; SSE2-NEXT: retq
;
; SSSE3-LABEL: sext_16i8_to_16i16:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: movq (%rdi), %xmm0
; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psllw $8, %xmm0
; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: psllw $8, %xmm1
+; SSSE3-NEXT: movq 8(%rdi), %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSSE3-NEXT: psraw $8, %xmm1
; SSSE3-NEXT: retq
;
; SSE41-LABEL: sext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: sext_16i8_to_16i16:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: sext_16i8_to_16i16:
; X32-SSE41-LABEL: sext_16i8_to_16i16:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movdqa (%eax), %xmm1
-; X32-SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; X32-SSE41-NEXT: psllw $8, %xmm0
-; X32-SSE41-NEXT: psraw $8, %xmm0
-; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE41-NEXT: psllw $8, %xmm1
-; X32-SSE41-NEXT: psraw $8, %xmm1
+; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
%X = load <16 x i8>* %ptr
define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
; SSE2-LABEL: load_sext_4i8_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movd (%rdi), %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movsbq 1(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movsbq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movsbq 3(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movsbq 2(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
-; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i8_to_4i64:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movd (%rdi), %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movsbq 1(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movsbq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movsbq 3(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movsbq 2(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
-; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i8_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
-; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i8_to_4i64:
; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovzxbd %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxbq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
%X = load <4 x i8>* %ptr
define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
; SSE2-LABEL: load_sext_4i16_to_4i64:
; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movswq 2(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movswq (%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movswq 6(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movswq 4(%rdi), %rax
; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
-; SSE2-NEXT: movd %rax, %xmm2
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE2-NEXT: retq
;
; SSSE3-LABEL: load_sext_4i16_to_4i64:
; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movswq 2(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movswq (%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movswq 6(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movswq 4(%rdi), %rax
; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
-; SSSE3-NEXT: movd %rax, %xmm2
; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: load_sext_4i16_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movq (%rdi), %xmm0
-; SSE41-NEXT: pmovzxwd %xmm0, %xmm1
-; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
; SSE41-NEXT: retq
;
; AVX1-LABEL: load_sext_4i16_to_4i64:
; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
; X32-SSE41: # BB#0: # %entry
; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movsd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovzxwd %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxwq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1
; X32-SSE41-NEXT: retl
entry:
%X = load <4 x i16>* %ptr
; SSE41-LABEL: load_zext_16i8_to_16i16:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxbw (%rdi), %xmm0
+; SSE41-NEXT: pmovzxbw 8(%rdi), %xmm1
; SSE41-NEXT: retq
; AVX1-LABEL: load_zext_16i8_to_16i16:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxbw 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; SSE41-LABEL: load_zext_8i16_to_8i32:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxwd %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxwd (%rdi), %xmm0
+; SSE41-NEXT: pmovzxwd 8(%rdi), %xmm1
; SSE41-NEXT: retq
; AVX1-LABEL: load_zext_8i16_to_8i32:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxwd (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxwd 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; SSE41-LABEL: load_zext_4i32_to_4i64:
; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pshufd $250, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxdq (%rdi), %xmm0
+; SSE41-NEXT: pmovzxdq 8(%rdi), %xmm1
; SSE41-NEXT: retq
; AVX1-LABEL: load_zext_4i32_to_4i64:
; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxdq (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxdq 8(%rdi), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq