SDLoc dl(Op);
SDValue V;
- bool First = true;
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
- for (unsigned i = 0; i < 16; ++i) {
+ for (unsigned i = 0; i < 16; i += 2) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
- if (ThisIsNonZero && First) {
- if (NumZero)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ bool NextIsNonZero = (NonZeros & (1 << (i + 1))) != 0;
+ if (!ThisIsNonZero && !NextIsNonZero)
+ continue;
+
+ // FIXME: Investigate combining the first 4 bytes as a i32 instead.
+ SDValue Elt;
+ if (ThisIsNonZero) {
+ if (NumZero || NextIsNonZero)
+ Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
else
- V = DAG.getUNDEF(MVT::v8i16);
- First = false;
+ Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
}
- if ((i & 1) != 0) {
- // FIXME: Investigate extending to i32 instead of just i16.
- // FIXME: Investigate combining the first 4 bytes as a i32 instead.
- SDValue ThisElt, LastElt;
- bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
- if (LastIsNonZero) {
- LastElt =
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
- }
- if (ThisIsNonZero) {
- ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
- ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
- DAG.getConstant(8, dl, MVT::i8));
- if (LastIsNonZero)
- ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
- } else
- ThisElt = LastElt;
-
- if (ThisElt) {
- if (1 == i) {
- V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
- : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- if (NumZero)
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
- V = DAG.getBitcast(MVT::v8i16, V);
- } else {
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
- DAG.getIntPtrConstant(i / 2, dl));
- }
+ if (NextIsNonZero) {
+ SDValue NextElt;
+ if (i == 0 && NumZero)
+ NextElt = DAG.getZExtOrTrunc(Op.getOperand(i+1), dl, MVT::i32);
+ else
+ NextElt = DAG.getAnyExtOrTrunc(Op.getOperand(i+1), dl, MVT::i32);
+ NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt,
+ DAG.getConstant(8, dl, MVT::i8));
+ if (ThisIsNonZero)
+ Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt);
+ else
+ Elt = NextElt;
+ }
+
+ // If the first insertion is not at index 0, insert into a zero vector to
+ // break any register dependency; otherwise use SCALAR_TO_VECTOR.
+ if (!V) {
+ if (i != 0)
+ V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+ else {
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
+ V = DAG.getBitcast(MVT::v8i16, V);
+ continue;
}
}
+ Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt);
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt,
+ DAG.getIntPtrConstant(i / 2, dl));
}
return DAG.getBitcast(MVT::v16i8, V);
define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
; SSE2-LABEL: test_buildvector_v16i8_partial:
; SSE2: # %bb.0:
-; SSE2-NEXT: movzbl %dil, %eax
-; SSE2-NEXT: pinsrw $1, %eax, %xmm0
-; SSE2-NEXT: movzbl %sil, %eax
-; SSE2-NEXT: pinsrw $3, %eax, %xmm0
-; SSE2-NEXT: movzbl %dl, %eax
-; SSE2-NEXT: pinsrw $4, %eax, %xmm0
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pinsrw $1, %edi, %xmm0
+; SSE2-NEXT: pinsrw $3, %esi, %xmm0
+; SSE2-NEXT: pinsrw $4, %edx, %xmm0
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: pinsrw $5, %ecx, %xmm0
-; SSE2-NEXT: movzbl %r8b, %eax
-; SSE2-NEXT: pinsrw $6, %eax, %xmm0
+; SSE2-NEXT: pinsrw $6, %r8d, %xmm0
; SSE2-NEXT: shll $8, %r9d
; SSE2-NEXT: pinsrw $7, %r9d, %xmm0
; SSE2-NEXT: retq
define <3 x i16> @sext_i8(<3 x i8>) {
; SSE3-LABEL: sext_i8:
; SSE3: # %bb.0:
-; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
-; SSE3-NEXT: movzbl {{[0-9]+}}(%esp), %edx
-; SSE3-NEXT: movd %edx, %xmm0
-; SSE3-NEXT: pinsrw $1, %ecx, %xmm0
+; SSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE3-NEXT: pinsrw $1, %eax, %xmm0
+; SSE3-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE3-NEXT: pinsrw $2, %eax, %xmm0
; SSE3-NEXT: psllw $8, %xmm0
; SSE3-NEXT: psraw $8, %xmm0
; SSE2-NEXT: movzbl (%rsi), %ecx
; SSE2-NEXT: shll $8, %ecx
; SSE2-NEXT: orl %eax, %ecx
-; SSE2-NEXT: movzwl %cx, %eax
-; SSE2-NEXT: movd %eax, %xmm1
+; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,3,4,5,6,7]
; X86-SSE2-NEXT: movl (%esp), %edx
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-SSE2-NEXT: shll $8, %edx
+; X86-SSE2-NEXT: pxor %xmm0, %xmm0
; X86-SSE2-NEXT: pinsrw $1, %edx, %xmm0
; X86-SSE2-NEXT: shll $8, %esi
; X86-SSE2-NEXT: pinsrw $3, %esi, %xmm0
; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %eax
; X64-SSE2-NEXT: movl -{{[0-9]+}}(%rsp), %ecx
; X64-SSE2-NEXT: shll $8, %eax
+; X64-SSE2-NEXT: pxor %xmm0, %xmm0
; X64-SSE2-NEXT: pinsrw $1, %eax, %xmm0
; X64-SSE2-NEXT: shll $8, %ecx
; X64-SSE2-NEXT: pinsrw $3, %ecx, %xmm0