From: Simon Pilgrim Date: Wed, 27 Feb 2019 11:17:25 +0000 (+0000) Subject: [X86][AVX] Only combine loads to broadcasts for legal types X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=03864785358f74a80924d360e24a9f22478e4577;p=llvm [X86][AVX] Only combine loads to broadcasts for legal types Thanks to @echristo for spotting this. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@354961 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index cb272335e2b..e5b2f05473e 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7448,15 +7448,17 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, if (RepeatSize > ScalarSize) RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT, RepeatSize / ScalarSize); - if (SDValue RepeatLoad = EltsFromConsecutiveLoads( - RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) { - EVT BroadcastVT = - EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), - VT.getSizeInBits() / ScalarSize); - unsigned Opcode = RepeatSize > ScalarSize ? X86ISD::SUBV_BROADCAST - : X86ISD::VBROADCAST; - SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad); - return DAG.getBitcast(VT, Broadcast); + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), + VT.getSizeInBits() / ScalarSize); + if (TLI.isTypeLegal(BroadcastVT)) { + if (SDValue RepeatLoad = EltsFromConsecutiveLoads( + RepeatVT, RepeatedLoads, DL, DAG, Subtarget, isAfterLegalize)) { + unsigned Opcode = RepeatSize > ScalarSize ? 
X86ISD::SUBV_BROADCAST + : X86ISD::VBROADCAST; + SDValue Broadcast = DAG.getNode(Opcode, DL, BroadcastVT, RepeatLoad); + return DAG.getBitcast(VT, Broadcast); + } } } } diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll index 84537cb15df..3426d6504c0 100644 --- a/test/CodeGen/X86/avx-vbroadcast.ll +++ b/test/CodeGen/X86/avx-vbroadcast.ll @@ -856,6 +856,32 @@ define <4 x double> @broadcast_shuffle1032(double* %p) { ret <4 x double> %4 } +define void @broadcast_v16i32(i32* %a, <16 x i32>* %b) { +; X32-LABEL: broadcast_v16i32: +; X32: ## %bb.0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vbroadcastss (%ecx), %ymm0 +; X32-NEXT: vmovups %ymm0, 32(%eax) +; X32-NEXT: vmovups %ymm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: broadcast_v16i32: +; X64: ## %bb.0: +; X64-NEXT: vbroadcastss (%rdi), %ymm0 +; X64-NEXT: vmovups %ymm0, 32(%rsi) +; X64-NEXT: vmovups %ymm0, (%rsi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq + %1 = load i32, i32* %a, align 4 + %2 = insertelement <8 x i32> undef, i32 %1, i32 0 + %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer + %4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> + store <16 x i32> %4, <16 x i32>* %b, align 4 + ret void +} + ; ; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies. 
; diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll index e10dbdaeb4f..e7dd881960b 100644 --- a/test/CodeGen/X86/avx2-vbroadcast.ll +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -1037,6 +1037,48 @@ define <4 x double> @splat_concat4(double %d) { ret <4 x double> %5 } +define void @broadcast_v16i32(i32* %a, <16 x i32>* %b) { +; X32-AVX2-LABEL: broadcast_v16i32: +; X32-AVX2: ## %bb.0: +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX2-NEXT: vbroadcastss (%ecx), %ymm0 +; X32-AVX2-NEXT: vmovups %ymm0, 32(%eax) +; X32-AVX2-NEXT: vmovups %ymm0, (%eax) +; X32-AVX2-NEXT: vzeroupper +; X32-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: broadcast_v16i32: +; X64-AVX2: ## %bb.0: +; X64-AVX2-NEXT: vbroadcastss (%rdi), %ymm0 +; X64-AVX2-NEXT: vmovups %ymm0, 32(%rsi) +; X64-AVX2-NEXT: vmovups %ymm0, (%rsi) +; X64-AVX2-NEXT: vzeroupper +; X64-AVX2-NEXT: retq +; +; X32-AVX512VL-LABEL: broadcast_v16i32: +; X32-AVX512VL: ## %bb.0: +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-AVX512VL-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-AVX512VL-NEXT: vbroadcastss (%ecx), %zmm0 +; X32-AVX512VL-NEXT: vmovups %zmm0, (%eax) +; X32-AVX512VL-NEXT: vzeroupper +; X32-AVX512VL-NEXT: retl +; +; X64-AVX512VL-LABEL: broadcast_v16i32: +; X64-AVX512VL: ## %bb.0: +; X64-AVX512VL-NEXT: vbroadcastss (%rdi), %zmm0 +; X64-AVX512VL-NEXT: vmovups %zmm0, (%rsi) +; X64-AVX512VL-NEXT: vzeroupper +; X64-AVX512VL-NEXT: retq + %1 = load i32, i32* %a, align 4 + %2 = insertelement <8 x i32> undef, i32 %1, i32 0 + %3 = shufflevector <8 x i32> %2, <8 x i32> undef, <8 x i32> zeroinitializer + %4 = shufflevector <8 x i32> undef, <8 x i32> %3, <16 x i32> + store <16 x i32> %4, <16 x i32>* %b, align 4 + ret void +} + ; Test cases for . ; Instruction selection for broacast instruction fails if ; the load cannot be folded into the broadcast.