From: Craig Topper
Date: Sun, 30 Dec 2018 03:05:07 +0000 (+0000)
Subject: [X86] Don't mark SEXTLOAD from v4i8/v4i16/v8i8 as Custom on pre-sse4.1.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=eaec59f545e7da7271e60d2a3c88c2bdb48ca676;p=llvm

[X86] Don't mark SEXTLOAD from v4i8/v4i16/v8i8 as Custom on pre-sse4.1.

This seems to be getting in the way more than it's helping. This does
mean we stop scalarizing some cases, but I'm not convinced the
scalarization was really better.

Some of the changes to vsel-cmp-load.ll are regressions, but D56156
should fix them.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@350159 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 71f8c3264c7..ddefe05667f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -883,12 +883,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// scalars) and extend in-register to a legal 128-bit vector type. For sext
// loads these must work with a single scalar load.
for (MVT VT : MVT::integer_vector_valuetypes()) {
- if (!ExperimentalVectorWideningLegalization) {
- // We don't want narrow result types here when widening.
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
- setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
- }
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
index 78c6dbe4af0..369108e4ac5 100644
--- a/test/CodeGen/X86/madd.ll
+++ b/test/CodeGen/X86/madd.ll
@@ -581,16 +581,14 @@ define i32 @_Z9test_charPcS_i_128(i8* nocapture readonly, i8* nocapture readonly
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm1
; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm2
+; SSE2-NEXT: psraw $8, %xmm1
+; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: pmullw %xmm1, %xmm2
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: addq $16, %rcx
; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB4_1
@@ -666,9 +664,9 @@ define i32 @_Z9test_charPcS_i_256(i8* nocapture readonly, i8* nocapture readonly
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psraw $8, %xmm2
; SSE2-NEXT: psraw $8, %xmm3
; SSE2-NEXT: pmaddwd %xmm2, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
@@ -779,22 +777,20 @@ define i32 @_Z9test_charPcS_i_512(i8* nocapture readonly, i8* nocapture readonly
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB6_1: # %vector.body
; SSE2-NEXT: # 
=>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm3 +; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; SSE2-NEXT: psraw $8, %xmm6 +; SSE2-NEXT: pmaddwd %xmm5, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm4 -; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmaddwd %xmm3, %xmm5 -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmaddwd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pmaddwd %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB6_1 @@ -940,45 +936,41 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: movl %edx, %eax ; SSE2-NEXT: pxor %xmm8, %xmm8 ; SSE2-NEXT: xorl %ecx, %ecx -; SSE2-NEXT: pxor %xmm9, %xmm9 +; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB7_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqu (%rdi,%rcx), %xmm10 +; SSE2-NEXT: movdqu 16(%rdi,%rcx), %xmm7 +; SSE2-NEXT: movdqu (%rsi,%rcx), %xmm9 +; SSE2-NEXT: movdqu 16(%rsi,%rcx), %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pmaddwd %xmm5, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmaddwd 
%xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm9 -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmaddwd %xmm6, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmaddwd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmaddwd %xmm0, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: pmaddwd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm10[8],xmm0[9],xmm10[9],xmm0[10],xmm10[10],xmm0[11],xmm10[11],xmm0[12],xmm10[12],xmm0[13],xmm10[13],xmm0[14],xmm10[14],xmm0[15],xmm10[15] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmaddwd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7] +; SSE2-NEXT: psraw $8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmaddwd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB7_1 @@ -986,10 +978,10 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: paddd %xmm8, %xmm4 ; SSE2-NEXT: paddd %xmm8, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm8, %xmm9 +; SSE2-NEXT: paddd %xmm8, %xmm2 ; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] diff --git a/test/CodeGen/X86/pmovsx-inreg.ll b/test/CodeGen/X86/pmovsx-inreg.ll index ea9e296d619..9ab6917966b 100644 --- a/test/CodeGen/X86/pmovsx-inreg.ll +++ b/test/CodeGen/X86/pmovsx-inreg.ll @@ -53,10 +53,8 @@ define void @test2(<4 x i8>* %in, <4 x i64>* %out) nounwind { ; ; AVX1-LABEL: test2: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbq (%rdi), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %ymm1, (%rax) @@ -136,10 +134,8 @@ define void @test4(<8 x i8>* %in, <8 x i32>* %out) nounwind { ; ; AVX1-LABEL: test4: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %ymm1, (%rax) @@ -300,10 +296,8 @@ 
define void @test8(<4 x i16>* %in, <4 x i64>* %out) nounwind { ; ; AVX1-LABEL: test8: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwq (%rdi), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqu %ymm1, (%rax) diff --git a/test/CodeGen/X86/vec_cast.ll b/test/CodeGen/X86/vec_cast.ll index 23870bcf565..6e9a1676895 100644 --- a/test/CodeGen/X86/vec_cast.ll +++ b/test/CodeGen/X86/vec_cast.ll @@ -14,11 +14,10 @@ define <8 x i32> @a(<8 x i16> %a) nounwind { ; ; CHECK-WIN-LABEL: a: ; CHECK-WIN: # %bb.0: -; CHECK-WIN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; CHECK-WIN-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; CHECK-WIN-NEXT: movdqa (%rcx), %xmm1 +; CHECK-WIN-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; CHECK-WIN-NEXT: psrad $16, %xmm0 -; CHECK-WIN-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; CHECK-WIN-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; CHECK-WIN-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; CHECK-WIN-NEXT: psrad $16, %xmm1 ; CHECK-WIN-NEXT: retq %c = sext <8 x i16> %a to <8 x i32> diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index 1ccd636a7c1..a9cb10c7789 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -4278,13 +4278,12 @@ define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) { define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { ; SSE2-LABEL: sitofp_load_8i16_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -4323,15 +4322,13 @@ define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) { ; SSE2-LABEL: sitofp_load_8i8_to_8f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 ; SSE2-NEXT: retq ; @@ -4345,10 +4342,8 @@ define <8 x float> 
@sitofp_load_8i8_to_8f32(<8 x i8> *%a) { ; ; AVX1-LABEL: sitofp_load_8i8_to_8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 ; AVX1-NEXT: retq diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll index 3928c6826a7..cb3a02c2180 100644 --- a/test/CodeGen/X86/vector-sext.ll +++ b/test/CodeGen/X86/vector-sext.ll @@ -1921,30 +1921,28 @@ entry: define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; SSE2-LABEL: load_sext_4i8_to_4i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsbq 1(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: movsbq (%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movsbq 3(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: movsbq 2(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i8_to_4i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsbq 1(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: movsbq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movsbq 3(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm2 -; SSSE3-NEXT: movsbq 2(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_4i8_to_4i64: @@ -1955,10 +1953,8 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; ; AVX1-LABEL: load_sext_4i8_to_4i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbq (%rdi), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -1975,28 +1971,15 @@ define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { ; X32-SSE2-LABEL: load_sext_4i8_to_4i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movsbl 1(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE2-NEXT: movsbl (%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE2-NEXT: movsbl 3(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-SSE2-NEXT: movsbl 2(%eax), %eax -; X32-SSE2-NEXT: movd %eax, %xmm1 -; X32-SSE2-NEXT: sarl $31, %eax -; X32-SSE2-NEXT: movd %eax, %xmm3 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: load_sext_4i8_to_4i64: @@ -2014,20 +1997,24 @@ entry: define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { ; SSE2-LABEL: load_sext_4i8_to_4i64_extract: ; SSE2: # %bb.0: -; SSE2-NEXT: movsbq 3(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: movsbq 2(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i8_to_4i64_extract: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movsbq 3(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: movsbq 2(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: pxor %xmm1, %xmm1 +; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_4i8_to_4i64_extract: @@ -2037,9 +2024,7 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { ; ; AVX1-LABEL: load_sext_4i8_to_4i64_extract: ; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_sext_4i8_to_4i64_extract: @@ -2059,17 +2044,13 @@ define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { ; X32-SSE2-LABEL: load_sext_4i8_to_4i64_extract: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movsbl 3(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: 
punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE2-NEXT: movsbl 2(%eax), %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: sarl $31, %eax -; X32-SSE2-NEXT: movd %eax, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: pxor %xmm1, %xmm1 +; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract: @@ -2423,50 +2404,42 @@ entry: define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { ; SSE2-LABEL: load_sext_8i8_to_8i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsbq 1(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: movsbq (%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movsbq 3(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: movsbq 2(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movsbq 5(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm3 -; SSE2-NEXT: movsbq 4(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: movsbq 7(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm4 -; SSE2-NEXT: movsbq 6(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_8i8_to_8i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsbq 1(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: movsbq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movsbq 3(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm2 -; SSSE3-NEXT: movsbq 2(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSSE3-NEXT: movsbq 5(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm3 -; SSSE3-NEXT: movsbq 4(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm2 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSSE3-NEXT: movsbq 7(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm4 -; SSSE3-NEXT: movsbq 6(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm3 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: pxor %xmm4, %xmm4 +; SSSE3-NEXT: pxor %xmm3, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSSE3-NEXT: psrad $24, %xmm3 +; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_8i8_to_8i64: @@ -2479,15 +2452,11 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { ; ; AVX1-LABEL: load_sext_8i8_to_8i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm1 -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm1, %xmm1 +; AVX1-NEXT: vpmovsxbq 6(%rdi), %xmm1 +; AVX1-NEXT: vpmovsxbq 4(%rdi), %xmm2 +; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbq (%rdi), %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: retq ; @@ -2505,50 +2474,22 @@ define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { ; X32-SSE2-LABEL: load_sext_8i8_to_8i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movsbl 1(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE2-NEXT: movsbl (%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE2-NEXT: movsbl 3(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-SSE2-NEXT: movsbl 2(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE2-NEXT: movsbl 5(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; X32-SSE2-NEXT: movsbl 4(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm4 +; X32-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X32-SSE2-NEXT: psrad $24, %xmm1 +; X32-SSE2-NEXT: pxor %xmm4, %xmm4 +; X32-SSE2-NEXT: 
pxor %xmm3, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; X32-SSE2-NEXT: psrad $24, %xmm3 +; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 +; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X32-SSE2-NEXT: movsbl 7(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm4 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm3 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; X32-SSE2-NEXT: movsbl 6(%eax), %eax -; X32-SSE2-NEXT: movd %eax, %xmm3 -; X32-SSE2-NEXT: sarl $31, %eax -; X32-SSE2-NEXT: movd %eax, %xmm5 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] ; X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: load_sext_8i8_to_8i64: @@ -2882,25 +2823,21 @@ entry: define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { ; SSE2-LABEL: load_sext_8i8_to_8i32: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $24, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_8i8_to_8i32: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psrad $24, %xmm1 ; SSSE3-NEXT: retq ; @@ -2912,10 +2849,8 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { ; ; AVX1-LABEL: load_sext_8i8_to_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -2932,13 +2867,11 @@ define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { ; X32-SSE2-LABEL: load_sext_8i8_to_8i32: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero ; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X32-SSE2-NEXT: psrad $24, %xmm0 +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: psrad $24, %xmm1 ; X32-SSE2-NEXT: retl ; @@ -5227,21 +5160,19 @@ entry: define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) { ; SSE2-LABEL: load_sext_16i8_to_16i16: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: psraw $8, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_16i8_to_16i16: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; SSSE3-NEXT: psraw $8, %xmm0 -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: psraw $8, %xmm1 ; SSSE3-NEXT: retq ; @@ -5271,11 +5202,10 @@ define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) { ; X32-SSE2-LABEL: load_sext_16i8_to_16i16: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: movdqa (%eax), %xmm1 +; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X32-SSE2-NEXT: psraw $8, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; X32-SSE2-NEXT: psraw $8, %xmm1 ; X32-SSE2-NEXT: retl ; @@ -5391,30 +5321,26 @@ entry: define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; SSE2-LABEL: load_sext_4i16_to_4i64: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movswq 2(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: movswq (%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movswq 6(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm2 -; 
SSE2-NEXT: movswq 4(%rdi), %rax -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_4i16_to_4i64: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movswq 2(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: movswq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movswq 6(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm2 -; SSSE3-NEXT: movswq 4(%rdi), %rax -; SSSE3-NEXT: movq %rax, %xmm1 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: psrad $16, %xmm1 +; SSSE3-NEXT: pxor %xmm2, %xmm2 +; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: load_sext_4i16_to_4i64: @@ -5425,10 +5351,8 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; ; AVX1-LABEL: load_sext_4i16_to_4i64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxwd (%rdi), %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 +; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm0 +; AVX1-NEXT: vpmovsxwq (%rdi), %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -5445,28 +5369,14 @@ define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { ; X32-SSE2-LABEL: load_sext_4i16_to_4i64: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movswl 2(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE2-NEXT: movswl (%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 +; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X32-SSE2-NEXT: psrad $16, %xmm1 +; X32-SSE2-NEXT: pxor %xmm2, %xmm2 +; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 +; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE2-NEXT: movswl 6(%eax), %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: sarl $31, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X32-SSE2-NEXT: movswl 4(%eax), %eax -; X32-SSE2-NEXT: movd %eax, %xmm1 -; X32-SSE2-NEXT: sarl $31, %eax -; X32-SSE2-NEXT: movd %eax, %xmm3 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; 
X32-SSE2-NEXT: retl ; ; X32-SSE41-LABEL: load_sext_4i16_to_4i64: @@ -5484,21 +5394,19 @@ entry: define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) { ; SSE2-LABEL: load_sext_8i16_to_8i32: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: load_sext_8i16_to_8i32: ; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa (%rdi), %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psrad $16, %xmm1 ; SSSE3-NEXT: retq ; @@ -5528,11 +5436,10 @@ define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) { ; X32-SSE2-LABEL: load_sext_8i16_to_8i32: ; X32-SSE2: # %bb.0: # %entry ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: movdqa (%eax), %xmm1 +; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X32-SSE2-NEXT: psrad $16, %xmm0 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; X32-SSE2-NEXT: psrad $16, %xmm1 ; X32-SSE2-NEXT: retl ; diff --git a/test/CodeGen/X86/vsel-cmp-load.ll b/test/CodeGen/X86/vsel-cmp-load.ll index d317377a93a..c7ee24b3d09 100644 --- a/test/CodeGen/X86/vsel-cmp-load.ll +++ b/test/CodeGen/X86/vsel-cmp-load.ll @@ -115,9 +115,12 @@ define <8 x i32> @slt_zero(<8 x i8>* %p, <8 x i32> %x, <8 x i32> %y) { ; AVX1-LABEL: slt_zero: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxbw (%rdi), %xmm2 -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero +; AVX1-NEXT: vpslld $24, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; AVX1-NEXT: vpslld $24, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -215,12 +218,11 @@ define <4 x double> @sgt_zero_fp_select(<4 x i8>* %p, <4 x double> %x, <4 x doub ; AVX1-LABEL: sgt_zero_fp_select: ; AVX1: # %bb.0: ; AVX1-NEXT: vpmovsxbd (%rdi), %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm3 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vinsertf128 
$1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: retq
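
For reference, the pattern this change affects is a sign-extending load of a narrow
vector type, as exercised by the load_sext_* tests above. A minimal IR sketch of
that shape (the function name is illustrative; it mirrors load_sext_4i8_to_4i64):

define <4 x i64> @sext_load_example(<4 x i8>* %ptr) {
entry:
  ; The DAG combiner folds this single-use load + sext into one SEXTLOAD node,
  ; whose legalization action is what this patch stops marking as Custom.
  %v = load <4 x i8>, <4 x i8>* %ptr
  %s = sext <4 x i8> %v to <4 x i64>
  ret <4 x i64> %s
}

On pre-SSE4.1 targets this now lowers to the punpcklbw/punpcklwd/psrad widening
sequences shown in the diffs above rather than per-element movsbq scalarization.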