From bb8b9b58f2891b4f2b1c21ed89ed2f6b2755c48a Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 31 Jan 2019 11:41:10 +0000
Subject: [PATCH] [X86][AVX] Enable AVX1 broadcasts in shuffle combining

Enables 32/64-bit scalar load broadcasts on AVX1 targets.

The extractelement-load.ll regression will be fixed shortly in a followup
commit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352743 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp            | 26 ++++++++++++++-----
 test/CodeGen/X86/avx-vbroadcast.ll            |  3 +--
 test/CodeGen/X86/extractelement-load.ll       |  3 ++-
 .../X86/insert-into-constant-vector.ll        |  3 +--
 test/CodeGen/X86/insert-loaded-scalar.ll      |  3 +--
 test/CodeGen/X86/insertelement-var-index.ll   |  3 +--
 test/CodeGen/X86/vector-shuffle-128-v2.ll     |  3 +--
 test/CodeGen/X86/widened-broadcast.ll         |  9 +++----
 8 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8412c95ce4e..0c400980876 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -31035,15 +31035,27 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   }
 
   // Attempt to match against broadcast-from-vector.
-  // TODO: Add (partial) AVX1 support.
-  if (Subtarget.hasAVX2() && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
+  // Limit AVX1 to cases where we're loading+broadcasting a scalar element.
+  if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
+      && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
     SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
     if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
-      if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
-        return SDValue(); // Nothing to do!
-      Res = DAG.getBitcast(MaskVT, V1);
-      Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
-      return DAG.getBitcast(RootVT, Res);
+      if (V1.getValueType() == MaskVT &&
+          V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          MayFoldLoad(V1.getOperand(0))) {
+        if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+          return SDValue(); // Nothing to do!
+        Res = V1.getOperand(0);
+        Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+        return DAG.getBitcast(RootVT, Res);
+      }
+      if (Subtarget.hasAVX2()) {
+        if (Depth == 1 && Root.getOpcode() == X86ISD::VBROADCAST)
+          return SDValue(); // Nothing to do!
+        Res = DAG.getBitcast(MaskVT, V1);
+        Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res);
+        return DAG.getBitcast(RootVT, Res);
+      }
     }
   }
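Note on the lowering change: AVX1's vbroadcastss/vbroadcastsd (and vmovddup)
can only splat directly from memory; register-to-register splat forms only
arrive with AVX2. That is why the AVX1 path above insists on a foldable
scalar load (SCALAR_TO_VECTOR of a loadable operand) before emitting
X86ISD::VBROADCAST, while AVX2 keeps the unconditional register fallback.
A minimal standalone sketch of the gating logic, with illustrative names
(canLowerSplatToBroadcast and its parameters are hypothetical, not LLVM API):

    // Sketch: when may a splat shuffle mask become X86ISD::VBROADCAST?
    bool canLowerSplatToBroadcast(bool hasAVX, bool hasAVX2,
                                  unsigned maskEltSizeInBits,
                                  bool srcIsFoldableScalarLoad) {
      if (hasAVX2)
        return true;                      // register or memory source works
      if (hasAVX && maskEltSizeInBits >= 32)
        return srcIsFoldableScalarLoad;   // AVX1: 32/64-bit load+splat only
      return false;
    }

Keeping the AVX2-only branch separate preserves the old behaviour for
register sources, where AVX1 has no single-instruction 64-bit integer splat.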
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 157238622c4..ccb054dce43 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -596,8 +596,7 @@ define <2 x i64> @G(i64* %ptr) nounwind uwtable readnone ssp {
 ;
 ; X64-LABEL: G:
 ; X64:       ## %bb.0: ## %entry
-; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; X64-NEXT:    retq
 entry:
   %q = load i64, i64* %ptr, align 8
diff --git a/test/CodeGen/X86/extractelement-load.ll b/test/CodeGen/X86/extractelement-load.ll
index c4a597b90ed..fbebc07aae0 100644
--- a/test/CodeGen/X86/extractelement-load.ll
+++ b/test/CodeGen/X86/extractelement-load.ll
@@ -98,7 +98,8 @@ define i64 @t4(<2 x double>* %a) {
 ;
 ; X64-AVX-LABEL: t4:
 ; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    movq (%rdi), %rax
+; X64-AVX-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
+; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
 ; X64-AVX-NEXT:    retq
   %b = load <2 x double>, <2 x double>* %a, align 16
   %c = shufflevector <2 x double> %b, <2 x double> %b, <2 x i32> <i32 1, i32 0>
diff --git a/test/CodeGen/X86/insert-into-constant-vector.ll b/test/CodeGen/X86/insert-into-constant-vector.ll
index ea4c4f1a2d5..c90235cecf6 100644
--- a/test/CodeGen/X86/insert-into-constant-vector.ll
+++ b/test/CodeGen/X86/insert-into-constant-vector.ll
@@ -273,8 +273,7 @@ define <8 x i32> @elt7_v8i32(i32 %x) {
 ;
 ; X32AVX1-LABEL: elt7_v8i32:
 ; X32AVX1:       # %bb.0:
-; X32AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
+; X32AVX1-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
 ; X32AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; X32AVX1-NEXT:    vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
 ; X32AVX1-NEXT:    retl
diff --git a/test/CodeGen/X86/insert-loaded-scalar.ll b/test/CodeGen/X86/insert-loaded-scalar.ll
index 81cb533f442..66d27788161 100644
--- a/test/CodeGen/X86/insert-loaded-scalar.ll
+++ b/test/CodeGen/X86/insert-loaded-scalar.ll
@@ -180,8 +180,7 @@ define <2 x i64> @load64_ins_eltc_v2i64(i64* %p) nounwind {
 ;
 ; AVX1-LABEL: load64_ins_eltc_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: load64_ins_eltc_v2i64:
diff --git a/test/CodeGen/X86/insertelement-var-index.ll b/test/CodeGen/X86/insertelement-var-index.ll
index 3288416da54..c7bb6603b70 100644
--- a/test/CodeGen/X86/insertelement-var-index.ll
+++ b/test/CodeGen/X86/insertelement-var-index.ll
@@ -205,8 +205,7 @@ define <2 x i64> @load_i64_v2i64(i64* %p, i32 %y) nounwind {
 ;
 ; AVX1-LABEL: load_i64_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: load_i64_v2i64:
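Note on the test updates so far: they all show the same improvement, a
32/64-bit scalar splat-from-memory now selecting a single load-and-splat
instruction (vmovddup, or vbroadcastss for elt7_v8i32) instead of a scalar
load followed by vpermilps. A roughly equivalent C++ intrinsics repro
(a hypothetical example, not taken from the test suite):

    #include <immintrin.h>
    #include <cstdint>

    // Splat a 64-bit scalar from memory into both lanes of an XMM register.
    // Mirrors insert_dup_mem_v2i64: with this patch, AVX1 targets should
    // select one vmovddup instead of vmovsd + vpermilps.
    __m128i splat_load_i64(const int64_t *p) {
      return _mm_set1_epi64x(*p);
    }

Built with something like clang -O2 -mavx, this should now produce the
single vmovddup seen in the CHECK lines; with -mavx2 the same source
already selected vpbroadcastq.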
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index ffcaad8ee51..fcf00609d8f 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1266,8 +1266,7 @@ define <2 x i64> @insert_dup_mem_v2i64(i64* %ptr) {
 ;
 ; AVX1-LABEL: insert_dup_mem_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_dup_mem_v2i64:
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index 167128ae002..c192ee6fd28 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -582,8 +582,7 @@ define <4 x i32> @load_splat_4i32_2i32_0101(<2 x i32>* %vp) {
 ;
 ; AVX1-LABEL: load_splat_4i32_2i32_0101:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: load_splat_4i32_2i32_0101:
@@ -610,8 +609,7 @@ define <8 x i32> @load_splat_8i32_2i32_0101(<2 x i32>* %vp) {
 ;
 ; AVX1-LABEL: load_splat_8i32_2i32_0101:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -641,8 +639,7 @@ define <16 x i32> @load_splat_16i32_2i32_0101(<2 x i32>* %vp) {
 ;
 ; AVX1-LABEL: load_splat_16i32_2i32_0101:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vmovaps %ymm0, %ymm1
 ; AVX1-NEXT:    retq
--
2.40.0
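Note on the extractelement-load.ll regression mentioned in the commit
message: t4 extracts lane 1 of a lane-swapped load, which is just element 0
of the original vector, so the whole sequence should stay a single movq
load. With this patch alone, the broadcast combine fires first and AVX1
emits vmovddup + vpextrq instead; per the commit message, a followup
restores the scalar-load fold. A hypothetical intrinsics rendering of the
pattern (illustrative, not from the test suite; needs SSE4.1+ for vpextrq):

    #include <immintrin.h>
    #include <cstdint>

    // Swap the two lanes, then read lane 1, i.e. the first 8 bytes of *p.
    int64_t extract_swapped_lane1(const double *p) {
      __m128d b = _mm_load_pd(p);                       // <p[0], p[1]>
      __m128d c = _mm_shuffle_pd(b, b, 1);              // <p[1], p[0]>
      return _mm_extract_epi64(_mm_castpd_si128(c), 1); // bits of p[0]
    }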