From: Michael Kuperstein
Date: Thu, 6 Apr 2017 22:33:25 +0000 (+0000)
Subject: [X86] Revert r299387 due to AVX legalization infinite loop.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=bf82f16ca46c57496f783d9a6f2ddfbbc3fab34b;p=llvm

[X86] Revert r299387 due to AVX legalization infinite loop.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@299720 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9cbdfac6497..c7b01fa4eb4 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6120,54 +6120,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
   return SDValue();
 }
 
-// Attempt to lower a build vector of repeated elts as a build vector of unique
-// ops followed by a shuffle.
-static SDValue
-lowerBuildVectorWithRepeatedEltsUsingShuffle(SDValue V, SelectionDAG &DAG,
-                                             const X86Subtarget &Subtarget) {
-  MVT VT = V.getSimpleValueType();
-  unsigned NumElts = VT.getVectorNumElements();
-
-  // TODO - vXi8 insertions+shuffles often cause PSHUFBs which can lead to
-  // excessive/bulky shuffle mask creation.
-  if (VT.getScalarSizeInBits() < 16)
-    return SDValue();
-
-  // Create list of unique operands to be passed to a build vector and a shuffle
-  // mask describing the repetitions.
-  // TODO - we currently insert the first occurances in place - sometimes it
-  // might be better to insert them in other locations for shuffle efficiency.
-  bool HasRepeatedElts = false;
-  SmallVector<int, 16> Mask(NumElts, SM_SentinelUndef);
-  SmallVector<SDValue, 16> Uniques(V->op_begin(), V->op_end());
-  for (unsigned i = 0; i != NumElts; ++i) {
-    SDValue Op = Uniques[i];
-    if (Op.isUndef())
-      continue;
-    Mask[i] = i;
-
-    // Zeros can be efficiently repeated, so don't shuffle these.
-    if (X86::isZeroNode(Op))
-      continue;
-
-    // If any repeated operands are found then mark the build vector entry as
-    // undef and setup a copy in the shuffle mask.
-    for (unsigned j = i + 1; j != NumElts; ++j)
-      if (Op == Uniques[j]) {
-        HasRepeatedElts = true;
-        Mask[j] = i;
-        Uniques[j] = DAG.getUNDEF(VT.getScalarType());
-      }
-  }
-
-  if (!HasRepeatedElts)
-    return SDValue();
-
-  SDLoc DL(V);
-  return DAG.getVectorShuffle(VT, DL, DAG.getBuildVector(VT, DL, Uniques),
-                              DAG.getUNDEF(VT), Mask);
-}
-
 /// Custom lower build_vector of v16i8.
 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                      unsigned NumNonZero, unsigned NumZero,
@@ -7800,17 +7752,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (IsAllConstants)
     return SDValue();
 
+  // See if we can use a vector load to get all of the elements.
   if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
-    // See if we can use a vector load to get all of the elements.
     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
     if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
       return LD;
-
-    // Attempt to lower a build vector of repeated elts as single insertions
-    // followed by a shuffle.
-    if (SDValue V =
-            lowerBuildVectorWithRepeatedEltsUsingShuffle(Op, DAG, Subtarget))
-      return V;
   }
 
   // For AVX-length vectors, build the individual 128-bit pieces and use
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index d8a92f8eedd..4a86fa22f08 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -2425,9 +2425,12 @@ define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
 define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
 ; X32-LABEL: test_mm256_set1_epi64x:
 ; X32: # BB#0:
-; X32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 620603925d0..37b8753097c 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -6,8 +6,12 @@ define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: A:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
@@ -27,21 +31,17 @@ entry:
 define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
 ; X32-LABEL: A2:
 ; X32: ## BB#0: ## %entry
-; X32-NEXT: pushl %esi
-; X32-NEXT: Lcfi0:
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: Lcfi1:
-; X32-NEXT: .cfi_offset %esi, -8
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: movl (%ecx), %edx
-; X32-NEXT: movl 4(%ecx), %esi
-; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movl %esi, 4(%eax)
+; X32-NEXT: movl 4(%ecx), %ecx
+; X32-NEXT: movl %ecx, 4(%eax)
 ; X32-NEXT: movl %edx, (%eax)
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT: vmovd %edx, %xmm0
+; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
 ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32-NEXT: popl %esi
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: A2:
@@ -592,8 +592,12 @@ define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: G:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: G:
@@ -611,20 +615,16 @@ entry:
 define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
 ; X32-LABEL: G2:
 ; X32: ## BB#0: ## %entry
-; X32-NEXT: pushl %esi
-; X32-NEXT: Lcfi2:
-; X32-NEXT: .cfi_def_cfa_offset 8
-; X32-NEXT: Lcfi3:
-; X32-NEXT: .cfi_offset %esi, -8
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT: movl (%ecx), %edx
-; X32-NEXT: movl 4(%ecx), %esi
-; X32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: movl %esi, 4(%eax)
+; X32-NEXT: movl 4(%ecx), %ecx
+; X32-NEXT: movl %ecx, 4(%eax)
 ; X32-NEXT: movl %edx, (%eax)
-; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; X32-NEXT: popl %esi
+; X32-NEXT: vmovd %edx, %xmm0
+; X32-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: G2:
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 5df58f9fbae..ba47e2ba15c 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -189,7 +189,12 @@ define <2 x i64> @Q64(i64* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: Q64:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vpbroadcastq (%eax), %xmm0
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: Q64:
@@ -207,8 +212,13 @@ define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: QQ64:
 ; X32: ## BB#0: ## %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vbroadcastsd %xmm0, %ymm0
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm0
+; X32-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
+; X32-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: QQ64:
@@ -1430,8 +1440,12 @@ define void @isel_crash_2q(i64* %cV_R.addr) {
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; X32-NEXT: vmovaps %xmm0, (%esp)
-; X32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: vpbroadcastq %xmm1, %xmm1
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm1
+; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
 ; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT: vmovdqa %xmm1, {{[0-9]+}}(%esp)
 ; X32-NEXT: addl $60, %esp
@@ -1487,10 +1501,15 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
 ; X32-NEXT: movl 8(%ebp), %eax
 ; X32-NEXT: vxorps %ymm0, %ymm0, %ymm0
 ; X32-NEXT: vmovaps %ymm0, (%esp)
-; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: vbroadcastsd %xmm1, %ymm1
+; X32-NEXT: movl (%eax), %ecx
+; X32-NEXT: movl 4(%eax), %eax
+; X32-NEXT: vmovd %ecx, %xmm1
+; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $2, %ecx, %xmm1, %xmm1
+; X32-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1
+; X32-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm1
 ; X32-NEXT: vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-NEXT: vmovaps %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT: vmovdqa %ymm1, {{[0-9]+}}(%esp)
 ; X32-NEXT: movl %ebp, %esp
 ; X32-NEXT: popl %ebp
 ; X32-NEXT: vzeroupper
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index b4aae93143a..71417694b0d 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -1102,44 +1102,28 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
 ;
 
 define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwtable noinline ssp {
-; SSE2-LABEL: merge_4f32_f32_X0YY:
-; SSE2: # BB#0:
-; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: merge_4f32_f32_X0YY:
-; SSE41: # BB#0:
-; SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,mem[0],zero
-; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; SSE41-NEXT: retq
+; SSE-LABEL: merge_4f32_f32_X0YY:
+; SSE: # BB#0:
+; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: merge_4f32_f32_X0YY:
 ; AVX: # BB#0:
 ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,mem[0],zero
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0,0]
 ; AVX-NEXT: retq
 ;
-; X32-SSE1-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; X32-SSE1-NEXT: retl
-;
-; X32-SSE41-LABEL: merge_4f32_f32_X0YY:
-; X32-SSE41: # BB#0:
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE41-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,mem[0],zero
-; X32-SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; X32-SSE41-NEXT: retl
+; X32-SSE-LABEL: merge_4f32_f32_X0YY:
+; X32-SSE: # BB#0:
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; X32-SSE-NEXT: retl
   %val0 = load float, float* %ptr0, align 4
   %val1 = load float, float* %ptr1, align 4
   %res0 = insertelement <4 x float> undef, float %val0, i32 0
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 030ad7683f0..3071155172e 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2425,9 +2425,10 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind {
 ; X32-LABEL: test_mm_set1_epi64x:
 ; X32: # BB#0:
 ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test_mm_set1_epi64x:
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 8baef924521..a345f78e18c 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -537,7 +537,7 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; VEX-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
 ; VEX-NEXT: vcvttsd2si %xmm0, %rax
 ; VEX-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0
-; VEX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; VEX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
 ; VEX-NEXT: retq
 ;
 ; AVX512F-LABEL: fptoui_4f64_to_2i32:
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 35d935b107f..649b45712f5 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1177,8 +1177,8 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; SSE-NEXT: movd %xmm0, %rax
 ; SSE-NEXT: xorps %xmm0, %xmm0
 ; SSE-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,2,2]
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
@@ -1879,8 +1879,8 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; SSE-NEXT: cvtsi2ssq %rax, %xmm1
 ; SSE-NEXT: addss %xmm1, %xmm1
 ; SSE-NEXT: .LBB41_8:
+; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,2,2]
 ; SSE-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_4i64_to_4f32_undef:
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index 914dc7423a2..e9f1d1d8522 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -1263,13 +1263,14 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE41-NEXT: movzbl (%eax), %eax
 ; X32-SSE41-NEXT: movl %eax, %ecx
-; X32-SSE41-NEXT: shll $30, %ecx
+; X32-SSE41-NEXT: shll $31, %ecx
 ; X32-SSE41-NEXT: sarl $31, %ecx
-; X32-SSE41-NEXT: shll $31, %eax
+; X32-SSE41-NEXT: movd %ecx, %xmm0
+; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
+; X32-SSE41-NEXT: shll $30, %eax
 ; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2]
+; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
+; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
 ; X32-SSE41-NEXT: retl
 entry:
   %X = load <2 x i1>, <2 x i1>* %ptr
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 91d31173933..a9dff916431 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -318,20 +318,21 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res3
 }
 
+; FIXME: Duplicated load in i686
 define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
 ; X32-LABEL: buildvector_v4f32_0404:
 ; X32: # BB#0:
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT: vmovapd %xmm0, (%eax)
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT: vmovaps %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: buildvector_v4f32_0404:
 ; X64: # BB#0:
-; X64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X64-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X64-NEXT: vmovapd %xmm0, (%rdi)
+; X64-NEXT: vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[0],xmm1[0]
+; X64-NEXT: vmovaps %xmm0, (%rdi)
 ; X64-NEXT: retq
   %v0 = insertelement <4 x float> undef, float %a, i32 0
   %v1 = insertelement <4 x float> %v0, float %b, i32 1
diff --git a/test/CodeGen/X86/vshift-1.ll b/test/CodeGen/X86/vshift-1.ll
index 4bc65908ed6..7ad5706592e 100644
--- a/test/CodeGen/X86/vshift-1.ll
+++ b/test/CodeGen/X86/vshift-1.ll
@@ -28,9 +28,12 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
 ; X32-LABEL: shift1b:
 ; X32: # BB#0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-NEXT: psllq %xmm1, %xmm0
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-NEXT: psllq %xmm2, %xmm0
 ; X32-NEXT: movdqa %xmm0, (%eax)
 ; X32-NEXT: retl
 ;
diff --git a/test/CodeGen/X86/vshift-2.ll b/test/CodeGen/X86/vshift-2.ll
index 024727323ac..f79fc5bff96 100644
--- a/test/CodeGen/X86/vshift-2.ll
+++ b/test/CodeGen/X86/vshift-2.ll
@@ -28,9 +28,12 @@ define void @shift1b(<2 x i64> %val, <2 x i64>* %dst, i64 %amt) nounwind {
 ; X32-LABEL: shift1b:
 ; X32: # BB#0: # %entry
 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; X32-NEXT: psrlq %xmm1, %xmm0
+; X32-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; X32-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
+; X32-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; X32-NEXT: psrlq %xmm2, %xmm0
 ; X32-NEXT: movdqa %xmm0, (%eax)
 ; X32-NEXT: retl
 ;