From: Craig Topper
Date: Mon, 19 Aug 2019 18:15:50 +0000 (+0000)
Subject: [X86] Teach lowerV4I32Shuffle to only use broadcasts if the mask has more than one...
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d00a1ad97c5e51701d5746e0f12a9cb352451edc;p=llvm

[X86] Teach lowerV4I32Shuffle to only use broadcasts if the mask has more than
one non-undef element. Prioritize shifts over broadcast in lowerV8I16Shuffle.

The motivating case is the set of changes in vector-reduce-add.ll, where we
were doing extra work in the scalar domain instead of shuffling. There may be
a one-use check that needs to be looked into there, but this patch sidesteps
the issue by avoiding broadcasts that aren't really broadcasting.

Differential Revision: https://reviews.llvm.org/D66071

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369287 91177308-0d34-0410-b5e6-96231b3b80d8
---
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f0a4cf2aef8..a519c200e49 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13114,10 +13114,12 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
 
   if (NumV2Elements == 0) {
-    // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
-                                                    Mask, Subtarget, DAG))
-      return Broadcast;
+    // Try to use broadcast unless the mask only has one non-undef element.
+    if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) {
+      if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2,
+                                                      Mask, Subtarget, DAG))
+        return Broadcast;
+    }
 
     // Straight shuffle of a single input vector. For everything from SSE2
     // onward this has a single fast instruction with no scary immediates.
@@ -13798,16 +13800,16 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
 
   if (NumV2Inputs == 0) {
-    // Check for being able to broadcast a single element.
-    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
-                                                    Mask, Subtarget, DAG))
-      return Broadcast;
-
     // Try to use shift instructions.
     if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                             Zeroable, Subtarget, DAG))
       return Shift;
 
+    // Check for being able to broadcast a single element.
+    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
+                                                    Mask, Subtarget, DAG))
+      return Broadcast;
+
     // Use dedicated unpack instructions for masks that match their pattern.
     if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
       return V;
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index f5047b1d640..e2819151ce7 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -2481,23 +2481,23 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind
 ; AVX2-NEXT: # xmm7 = mem[0],zero
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT: vpbroadcastw %xmm8, %xmm8
+; AVX2-NEXT: vpslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1]
 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm0
 ; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0,1,2,3,4,5,6],xmm8[7]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
 ; AVX2-NEXT: vpbroadcastw %xmm9, %xmm1
 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
+; AVX2-NEXT: vpsllq $48, %xmm1, %xmm1
 ; AVX2-NEXT: vpbroadcastw %xmm2, %xmm2
 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3],xmm2[4,5,6,7]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
 ; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT: vpbroadcastw %xmm3, %xmm3
+; AVX2-NEXT: vpslld $16, %xmm3, %xmm3
 ; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5,6,7]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
diff --git a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index b3e154b3107..887e4747e78 100644
--- a/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -225,12 +225,11 @@ define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask1(<16 x i16>* %vp,
 define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm4 = [8,0,3,0,5,0,7,1]
-; CHECK-NEXT: vpermi2w %xmm2, %xmm3, %xmm4
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm3 = [9,0,3,0,5,0,7,1]
+; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm3
 ; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1
-; CHECK-NEXT: vmovdqu16 %xmm4, %xmm0 {%k1}
+; CHECK-NEXT: vmovdqu16 %xmm3, %xmm0 {%k1}
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
 %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32>
@@ -242,11 +241,10 @@ define <8 x i16> @test_masked_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8
 define <8 x i16> @test_masked_z_16xi16_to_8xi16_perm_mem_mask2(<16 x i16>* %vp, <8 x i16> %mask) {
 ; CHECK-LABEL: test_masked_z_16xi16_to_8xi16_perm_mem_mask2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpbroadcastw 2(%rdi), %xmm2
-; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3
-; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [8,0,3,0,5,0,7,1]
+; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2
+; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [9,0,3,0,5,0,7,1]
 ; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1
-; CHECK-NEXT: vpermi2w %xmm2, %xmm3, %xmm1 {%k1} {z}
+; CHECK-NEXT: vpermi2w (%rdi), %xmm2, %xmm1 {%k1} {z}
 ; CHECK-NEXT: vmovdqa %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %vec = load <16 x i16>, <16 x i16>* %vp
diff --git a/test/CodeGen/X86/insertelement-shuffle.ll b/test/CodeGen/X86/insertelement-shuffle.ll
index b4377bebb8f..a2b8e2dac86 100644
--- a/test/CodeGen/X86/insertelement-shuffle.ll
+++ b/test/CodeGen/X86/insertelement-shuffle.ll
@@ -9,7 +9,7 @@ define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounw
 ; X86: # %bb.0:
 ; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X86-NEXT: vpinsrw $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
-; X86-NEXT: vpbroadcastd %xmm1, %xmm1
+; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; X86-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X86-NEXT: retl
 ;
@@ -17,7 +17,7 @@ define <8 x float> @insert_subvector_256(i16 %x0, i16 %x1, <8 x float> %v) nounw
 ; X64: # %bb.0:
 ; X64-NEXT: vmovd %edi, %xmm1
 ; X64-NEXT: vpinsrw $1, %esi, %xmm1, %xmm1
-; X64-NEXT: vpbroadcastd %xmm1, %xmm1
+; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; X64-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7]
 ; X64-NEXT: retq
 %ins1 = insertelement <2 x i16> undef, i16 %x0, i32 0
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 0a8a0fe9306..d2498f35572 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -824,7 +824,7 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 ; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
@@ -848,7 +848,7 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 ; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
@@ -869,7 +869,7 @@ define <4 x double> @PR34175(<32 x i16>* %p) {
 ; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
 ; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index b5c37263423..d55c1d23a1d 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -656,7 +656,8 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocaptu
 ; X86-AVX512-LABEL: pinsrd_from_shufflevector_i32:
 ; X86-AVX512: ## %bb.0: ## %entry
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08]
+; X86-AVX512-NEXT: vpermilps $36, (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x08,0x24]
+; X86-AVX512-NEXT: ## xmm1 = mem[0,1,2,0]
 ; X86-AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
 ; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3]
 ; X86-AVX512-NEXT: retl ## encoding: [0xc3]
@@ -679,7 +680,8 @@ define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocaptu
 ;
 ; X64-AVX512-LABEL: pinsrd_from_shufflevector_i32:
 ; X64-AVX512: ## %bb.0: ## %entry
-; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f]
+; X64-AVX512-NEXT: vpermilps $36, (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x0f,0x24]
+; X64-AVX512-NEXT: ## xmm1 = mem[0,1,2,0]
 ; X64-AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
 ; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3]
 ; X64-AVX512-NEXT: retq ## encoding: [0xc3]
@@ -1224,7 +1226,8 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
 ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x57,0xd2]
 ; AVX512-NEXT: vblendps $1, %xmm0, %xmm2, %xmm0 ## encoding: [0xc4,0xe3,0x69,0x0c,0xc0,0x01]
 ; AVX512-NEXT: ## xmm0 = xmm0[0],xmm2[1,2,3]
-; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc9]
+; AVX512-NEXT: vpermilps $36, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc9,0x24]
+; AVX512-NEXT: ## xmm1 = xmm1[0,1,2,0]
 ; AVX512-NEXT: vblendps $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x08]
 ; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3]
 ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3]
diff --git a/test/CodeGen/X86/vector-reduce-add.ll b/test/CodeGen/X86/vector-reduce-add.ll
index cd1147aff88..86c8b982cb0 100644
--- a/test/CodeGen/X86/vector-reduce-add.ll
+++ b/test/CodeGen/X86/vector-reduce-add.ll
@@ -1101,57 +1101,27 @@ define i8 @test_v4i8_load(<4 x i8>* %p) {
 ; SSE41-NEXT: # kill: def $al killed $al killed $eax
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: test_v4i8_load:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i8_load:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl (%rdi), %eax
-; AVX2-NEXT: vmovd %eax, %xmm0
-; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
+; AVX-LABEL: test_v4i8_load:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpextrb $0, %xmm0, %eax
+; AVX-NEXT: # kill: def $al killed $al killed $eax
+; AVX-NEXT: retq
 ;
-; AVX512BW-LABEL: test_v4i8_load:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: movl (%rdi), %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm0
-; AVX512BW-NEXT: shrl $16, %eax
-; AVX512BW-NEXT: vmovd %eax, %xmm1
-; AVX512BW-NEXT: vpbroadcastw %xmm1, %xmm1
-; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v4i8_load:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movl (%rdi), %eax
-; AVX512VL-NEXT: vmovd %eax, %xmm0
-; AVX512VL-NEXT: shrl $16, %eax
-; AVX512VL-NEXT: vpbroadcastw %eax, %xmm1
-; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512VL-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: retq
+; AVX512-LABEL: test_v4i8_load:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vpextrb $0, %xmm0, %eax
+; AVX512-NEXT: # kill: def $al killed $al killed $eax
+; AVX512-NEXT: retq
 %a0 = load <4 x i8>, <4 x i8>* %p
 %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
 ret i8 %1
diff --git a/test/CodeGen/X86/vector-shuffle-128-v4.ll b/test/CodeGen/X86/vector-shuffle-128-v4.ll
index f4a4bb5e6b4..4b012e73f9c 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -354,17 +354,11 @@ define <4 x i32> @shuffle_v4i32_0124(<4 x i32> %a, <4 x i32> %b) {
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: shuffle_v4i32_0124:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4i32_0124:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4i32_0124:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
+; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX1OR2-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i32_0124:
 ; AVX512VL: # %bb.0:
@@ -451,19 +445,12 @@ define <4 x i32> @shuffle_v4i32_0412(<4 x i32> %a, <4 x i32> %b) {
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: shuffle_v4i32_0412:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuffle_v4i32_0412:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v4i32_0412:
+; AVX1OR2: # %bb.0:
+; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX1OR2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,2]
+; AVX1OR2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX1OR2-NEXT: retq
 ;
 ; AVX512VL-LABEL: shuffle_v4i32_0412:
 ; AVX512VL: # %bb.0:
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index c756fe7d197..95669b32b31 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1571,7 +1571,7 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; AVX2-SLOW-LABEL: shuffle_v8i16_XXX1X579:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
@@ -1579,7 +1579,7 @@ define <8 x i16> @shuffle_v8i16_XXX1X579(<8 x i16> %a, <8 x i16> %b) {
 ;
 ; AVX2-FAST-LABEL: shuffle_v8i16_XXX1X579:
 ; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,2,3,8,9,10,11,14,15,14,15]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
 ; AVX2-FAST-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index b74a0b1c0fe..d047042419c 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -309,7 +309,7 @@ define <4 x i32> @test_v16i32_0_1_2_12 (<16 x i32> %v) {
 ; ALL: # %bb.0:
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
 ; ALL-NEXT: vextractf128 $1, %ymm1, %xmm1
-; ALL-NEXT: vbroadcastss %xmm1, %xmm1
+; ALL-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,2,0]
 ; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
 ; ALL-NEXT: vzeroupper
 ; ALL-NEXT: retq
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index f670bf71a8f..e02dfd1668b 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -864,19 +864,12 @@ define <4 x i32> @combine_nested_undef_test15(<4 x i32> %A, <4 x i32> %B) {
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
 ; SSE41-NEXT: retq
 ;
-; AVX1-LABEL: combine_nested_undef_test15:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
-; AVX1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: combine_nested_undef_test15:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
-; AVX2-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
-; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: retq
+; AVX-LABEL: combine_nested_undef_test15:
+; AVX: # %bb.0:
+; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,1]
+; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX-NEXT: retq
 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32>
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32>
 ret <4 x i32> %2
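
For illustration, below is a minimal IR reproducer in the spirit of the lowerV8I16Shuffle change above. It is a sketch, not part of the patch: the function name is made up, and the expected codegen is inferred from the avg.ll and vector-reduce-add.ll diffs rather than taken from an actual llc run. The shuffle mask uses only one non-undef element, so with shifts now tried before broadcast the single live lane would be expected to be placed by a byte shift (e.g. vpslldq) instead of a vpbroadcastw plus blend.

define <8 x i16> @single_lane_shuffle(<8 x i16> %a) {
  ; Lane 0 of %a is the only element referenced; every other result lane is undef,
  ; so this is the "broadcast that isn't really broadcasting" case the patch targets.
  %s = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 0>
  ret <8 x i16> %s
}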