NewMask);
}
+/// Classify a two-operand shuffle mask as a single-lane insertion.
+///
+/// Mask values in the range [0, Mask.size()) select from the first vector
+/// operand. Every other lane must hold exactly `lane + Mask.size()`, i.e. the
+/// same lane of the second vector operand. When exactly one lane reads from
+/// the first operand, return that lane's position in the mask. In every other
+/// case (no such lane, more than one such lane, or a lane that is neither an
+/// operand 0 element nor an in-place operand 1 element), return -1.
+static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
+  const int NumMaskElts = Mask.size();
+  int InsertedLane = -1;
+  // TODO: This does not match if there are undef elements in the shuffle mask.
+  //       Should we ignore undefs in the shuffle mask instead? The trade-off
+  //       is removing an instruction (a shuffle), but losing the knowledge
+  //       that some vector lanes are not needed.
+  for (int Lane = 0; Lane < NumMaskElts; ++Lane) {
+    const int MaskElt = Mask[Lane];
+    const bool IsFromOp0 = MaskElt >= 0 && MaskElt < NumMaskElts;
+    if (!IsFromOp0) {
+      // An operand 1 element is only acceptable if it stays in its own lane.
+      // Negative (undef) mask values can never satisfy this, so they bail out
+      // here as well.
+      if (MaskElt != Lane + NumMaskElts)
+        return -1;
+      continue;
+    }
+    // A second element taken from operand 0 disqualifies the mask: we want
+    // exactly one.
+    if (InsertedLane != -1)
+      return -1;
+    InsertedLane = Lane;
+  }
+  return InsertedLane;
+}
+
+/// Fold a shuffle that merges a single lane of an insertelement result into
+/// an otherwise untouched vector:
+///   shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
+/// The scalar `x` is reachable directly, so the shuffle can be dropped.
+/// Returns the replacement INSERT_VECTOR_ELT node, or an empty SDValue when
+/// the pattern does not match.
+static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
+                                      SelectionDAG &DAG) {
+  // Step 1: the mask must take exactly one lane from one operand and pass the
+  // other operand through lane-for-lane. Try the mask as written first.
+  ArrayRef<int> ShufMask = Shuf->getMask();
+  SmallVector<int, 16> SwappedMask(ShufMask.begin(), ShufMask.end());
+  SDValue EltSrc = Shuf->getOperand(0);
+  SDValue PassThru = Shuf->getOperand(1);
+  int DstLane = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ShufMask);
+  if (DstLane == -1) {
+    // No match as written; retry with the roles of the operands exchanged.
+    ShuffleVectorSDNode::commuteMask(SwappedMask);
+    DstLane = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(SwappedMask);
+    if (DstLane == -1)
+      return SDValue();
+    // Keep the operands in sync with the commuted mask we matched against.
+    ShufMask = SwappedMask;
+    std::swap(EltSrc, PassThru);
+  }
+
+  // Step 2: exactly one element of EltSrc is inserted into PassThru. See
+  // whether that element is available as a scalar through a real insert
+  // element instruction.
+  // TODO: We can try harder to locate the element as a scalar. Examples: it
+  //       could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a
+  //       constant.
+  const int SrcLane = ShufMask[DstLane];
+  assert(SrcLane >= 0 && SrcLane < (int)ShufMask.size() &&
+         "Shuffle mask value must be from operand 0");
+  if (EltSrc.getOpcode() != ISD::INSERT_VECTOR_ELT)
+    return SDValue();
+
+  // The insert must use a constant index that names the very lane the shuffle
+  // reads; otherwise the shuffled element is not the inserted scalar.
+  SDValue OldInsIndex = EltSrc.getOperand(2);
+  auto *OldInsIndexC = dyn_cast<ConstantSDNode>(OldInsIndex);
+  if (!OldInsIndexC)
+    return SDValue();
+  if (OldInsIndexC->getSExtValue() != SrcLane)
+    return SDValue();
+
+  // An insertelement with a constant insertion index already exists, so there
+  // is no need to check the legality/profitability of a replacement that
+  // differs at most in the constant value. The target should be able to lower
+  // any of those in a similar way. If not, legalization will expand this to a
+  // scalar-to-vector plus shuffle.
+  //
+  // The shuffle may relocate the scalar away from the lane the original
+  // insert used, so the new insert goes to the shuffle's mask position
+  // (DstLane), not to the original insert's index.
+  SDLoc DL(Shuf);
+  SDValue NewInsIndex =
+      DAG.getConstant(DstLane, DL, OldInsIndex.getValueType());
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, EltSrc.getValueType(),
+                     PassThru, EltSrc.getOperand(1), NewInsIndex);
+}
+
SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
EVT VT = N->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
if (SDValue V = simplifyShuffleMask(SVN, N0, N1, DAG))
return V;
+ if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
+ return InsElt;
+
// A shuffle of a single vector that is a splat can always be folded.
if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
if (N1->isUndef() && N0Shuf->isSplat())
;
; SSE4-LABEL: ins_elt_0:
; SSE4: # %bb.0:
-; SSE4-NEXT: pinsrd $0, %edi, %xmm0
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE4-NEXT: pinsrd $0, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: ins_elt_0:
; AVX: # %bb.0:
-; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: vpinsrd $0, %edi, %xmm1, %xmm0
; AVX-NEXT: retq
%ins = insertelement <4 x i32> %v1, i32 %x, i32 0
%shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
define <4 x i32> @ins_elt_1(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2-LABEL: ins_elt_1:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE4-LABEL: ins_elt_1:
; SSE4: # %bb.0:
-; SSE4-NEXT: pinsrd $1, %edi, %xmm0
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE4-NEXT: pinsrd $1, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: ins_elt_1:
; AVX: # %bb.0:
-; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm0
; AVX-NEXT: retq
%ins = insertelement <4 x i32> %v1, i32 %x, i32 1
%shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> <i32 4, i32 1, i32 6, i32 7>
define <4 x i32> @ins_elt_2_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2-LABEL: ins_elt_2_commute:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[3,0]
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: ins_elt_2_commute:
; SSE4: # %bb.0:
-; SSE4-NEXT: pinsrd $2, %edi, %xmm0
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; SSE4-NEXT: pinsrd $2, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: ins_elt_2_commute:
; AVX: # %bb.0:
-; AVX-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX-NEXT: vpinsrd $2, %edi, %xmm1, %xmm0
; AVX-NEXT: retq
%ins = insertelement <4 x i32> %v1, i32 %x, i32 2
%shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> <i32 0, i32 1, i32 6, i32 3>
define <4 x i32> @ins_elt_3_commute(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2-LABEL: ins_elt_3_commute:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[2,0]
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: ins_elt_3_commute:
; SSE4: # %bb.0:
-; SSE4-NEXT: pinsrd $3, %edi, %xmm0
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE4-NEXT: pinsrd $3, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: ins_elt_3_commute:
; AVX: # %bb.0:
-; AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; AVX-NEXT: vpinsrd $3, %edi, %xmm1, %xmm0
; AVX-NEXT: retq
%ins = insertelement <4 x i32> %v1, i32 %x, i32 3
%shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
;
; SSE4-LABEL: ins_elt_0_to_2:
; SSE4: # %bb.0:
-; SSE4-NEXT: pinsrd $0, %edi, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; SSE4-NEXT: pinsrd $2, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: ins_elt_0_to_2:
; AVX: # %bb.0:
-; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
+; AVX-NEXT: vpinsrd $2, %edi, %xmm1, %xmm0
; AVX-NEXT: retq
%ins = insertelement <4 x i32> %v1, i32 %x, i32 0
%shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> <i32 4, i32 5, i32 0, i32 7>
;
; SSE4-LABEL: ins_elt_1_to_0:
; SSE4: # %bb.0:
-; SSE4-NEXT: pinsrd $1, %edi, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; SSE4-NEXT: pinsrd $0, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: ins_elt_1_to_0:
; AVX: # %bb.0:
-; AVX-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
+; AVX-NEXT: vpinsrd $0, %edi, %xmm1, %xmm0
; AVX-NEXT: retq
%ins = insertelement <4 x i32> %v1, i32 %x, i32 1
%shuf = shufflevector <4 x i32> %ins, <4 x i32> %v2, <4 x i32> <i32 1, i32 5, i32 6, i32 7>
define <4 x i32> @ins_elt_2_to_3(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2-LABEL: ins_elt_2_to_3:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0,2]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,0]
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[2,0]
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0]
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; SSE4-LABEL: ins_elt_2_to_3:
; SSE4: # %bb.0:
-; SSE4-NEXT: pinsrd $2, %edi, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; SSE4-NEXT: pinsrd $3, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: ins_elt_2_to_3:
; AVX: # %bb.0:
-; AVX-NEXT: vpinsrd $2, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; AVX-NEXT: vpinsrd $3, %edi, %xmm1, %xmm0
; AVX-NEXT: retq
%ins = insertelement <4 x i32> %v1, i32 %x, i32 2
%shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
define <4 x i32> @ins_elt_3_to_1(i32 %x, <4 x i32> %v1, <4 x i32> %v2) {
; SSE2-LABEL: ins_elt_3_to_1:
; SSE2: # %bb.0:
-; SSE2-NEXT: movd %edi, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,0],xmm1[0,0]
+; SSE2-NEXT: movd %edi, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE4-LABEL: ins_elt_3_to_1:
; SSE4: # %bb.0:
-; SSE4-NEXT: pinsrd $3, %edi, %xmm0
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; SSE4-NEXT: pinsrd $1, %edi, %xmm1
+; SSE4-NEXT: movdqa %xmm1, %xmm0
; SSE4-NEXT: retq
;
; AVX-LABEL: ins_elt_3_to_1:
; AVX: # %bb.0:
-; AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+; AVX-NEXT: vpinsrd $1, %edi, %xmm1, %xmm0
; AVX-NEXT: retq
%ins = insertelement <4 x i32> %v1, i32 %x, i32 3
%shuf = shufflevector <4 x i32> %v2, <4 x i32> %ins, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
;
; SSE41-LABEL: insert_reg_lo_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq %rdi, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pinsrq $0, %rdi, %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: insert_reg_lo_v2i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq %rdi, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_reg_lo_v2i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovq %rdi, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_reg_lo_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovq %rdi, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_reg_lo_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
+; AVX-NEXT: retq
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
ret <2 x i64> %shuffle
;
; SSE41-LABEL: insert_mem_lo_v2i64:
; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; SSE41-NEXT: pinsrq $0, (%rdi), %xmm0
; SSE41-NEXT: retq
;
-; AVX1-LABEL: insert_mem_lo_v2i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: insert_mem_lo_v2i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512VL-LABEL: insert_mem_lo_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT: retq
+; AVX-LABEL: insert_mem_lo_v2i64:
+; AVX: # %bb.0:
+; AVX-NEXT: vpinsrq $0, (%rdi), %xmm0, %xmm0
+; AVX-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 0, i32 3>
}
define <2 x i64> @insert_reg_hi_v2i64(i64 %a, <2 x i64> %b) {
-; SSE-LABEL: insert_reg_hi_v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movq %rdi, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_reg_hi_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %rdi, %xmm1
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_reg_hi_v2i64:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movq %rdi, %xmm1
+; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_reg_hi_v2i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movq %rdi, %xmm1
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_reg_hi_v2i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pinsrq $1, %rdi, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_reg_hi_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovq %rdi, %xmm1
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; AVX-NEXT: retq
%v = insertelement <2 x i64> undef, i64 %a, i32 0
%shuffle = shufflevector <2 x i64> %v, <2 x i64> %b, <2 x i32> <i32 2, i32 0>
}
define <2 x i64> @insert_mem_hi_v2i64(i64* %ptr, <2 x i64> %b) {
-; SSE-LABEL: insert_mem_hi_v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
+; SSE2-LABEL: insert_mem_hi_v2i64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: insert_mem_hi_v2i64:
+; SSE3: # %bb.0:
+; SSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: insert_mem_hi_v2i64:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: insert_mem_hi_v2i64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pinsrq $1, (%rdi), %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: insert_mem_hi_v2i64:
; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT: vpinsrq $1, (%rdi), %xmm0, %xmm0
; AVX-NEXT: retq
%a = load i64, i64* %ptr
%v = insertelement <2 x i64> undef, i64 %a, i32 0