From 7be78fabc933101ebb81e66792e4707b6bbe27e0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 7 Sep 2019 21:54:40 +0000 Subject: [PATCH] [X86] Add support for unfold broadcast loads from FMA instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@371323 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrFoldTables.cpp | 121 ++++++++++++++++++++ test/CodeGen/X86/avx512-broadcast-unfold.ll | 70 ++++++----- 2 files changed, 161 insertions(+), 30 deletions(-) diff --git a/lib/Target/X86/X86InstrFoldTables.cpp b/lib/Target/X86/X86InstrFoldTables.cpp index 4f98fdf5ea4..63474f9e504 100644 --- a/lib/Target/X86/X86InstrFoldTables.cpp +++ b/lib/Target/X86/X86InstrFoldTables.cpp @@ -5308,6 +5308,117 @@ static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = { { X86::VSUBPSZrr, X86::VSUBPSZrmb, TB_BCAST_SS }, }; +static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = { + { X86::VFMADD132PDZ128r, X86::VFMADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD132PDZ256r, X86::VFMADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD132PDZr, X86::VFMADD132PDZmb, TB_BCAST_SD }, + { X86::VFMADD132PSZ128r, X86::VFMADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD132PSZ256r, X86::VFMADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD132PSZr, X86::VFMADD132PSZmb, TB_BCAST_SS }, + { X86::VFMADD213PDZ128r, X86::VFMADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD213PDZ256r, X86::VFMADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD213PDZr, X86::VFMADD213PDZmb, TB_BCAST_SD }, + { X86::VFMADD213PSZ128r, X86::VFMADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD213PSZ256r, X86::VFMADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFMADD213PSZr, X86::VFMADD213PSZmb, TB_BCAST_SS }, + { X86::VFMADD231PDZ128r, X86::VFMADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFMADD231PDZ256r, X86::VFMADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFMADD231PDZr, X86::VFMADD231PDZmb, TB_BCAST_SD }, + { X86::VFMADD231PSZ128r, X86::VFMADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFMADD231PSZ256r, X86::VFMADD231PSZ256mb, 
TB_BCAST_SS }, + { X86::VFMADD231PSZr, X86::VFMADD231PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB132PDZ128r, X86::VFMADDSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB132PDZ256r, X86::VFMADDSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB132PDZr, X86::VFMADDSUB132PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB132PSZ128r, X86::VFMADDSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB132PSZ256r, X86::VFMADDSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB132PSZr, X86::VFMADDSUB132PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB213PDZ128r, X86::VFMADDSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB213PDZ256r, X86::VFMADDSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB213PDZr, X86::VFMADDSUB213PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB213PSZ128r, X86::VFMADDSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB213PSZ256r, X86::VFMADDSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB213PSZr, X86::VFMADDSUB213PSZmb, TB_BCAST_SS }, + { X86::VFMADDSUB231PDZ128r, X86::VFMADDSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFMADDSUB231PDZ256r, X86::VFMADDSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFMADDSUB231PDZr, X86::VFMADDSUB231PDZmb, TB_BCAST_SD }, + { X86::VFMADDSUB231PSZ128r, X86::VFMADDSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFMADDSUB231PSZ256r, X86::VFMADDSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFMADDSUB231PSZr, X86::VFMADDSUB231PSZmb, TB_BCAST_SS }, + { X86::VFMSUB132PDZ128r, X86::VFMSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB132PDZ256r, X86::VFMSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB132PDZr, X86::VFMSUB132PDZmb, TB_BCAST_SD }, + { X86::VFMSUB132PSZ128r, X86::VFMSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB132PSZ256r, X86::VFMSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB132PSZr, X86::VFMSUB132PSZmb, TB_BCAST_SS }, + { X86::VFMSUB213PDZ128r, X86::VFMSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB213PDZ256r, X86::VFMSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB213PDZr, X86::VFMSUB213PDZmb, TB_BCAST_SD }, + { X86::VFMSUB213PSZ128r, X86::VFMSUB213PSZ128mb, TB_BCAST_SS }, + { 
X86::VFMSUB213PSZ256r, X86::VFMSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB213PSZr, X86::VFMSUB213PSZmb, TB_BCAST_SS }, + { X86::VFMSUB231PDZ128r, X86::VFMSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUB231PDZ256r, X86::VFMSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUB231PDZr, X86::VFMSUB231PDZmb, TB_BCAST_SD }, + { X86::VFMSUB231PSZ128r, X86::VFMSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUB231PSZ256r, X86::VFMSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUB231PSZr, X86::VFMSUB231PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD132PDZ128r, X86::VFMSUBADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD132PDZ256r, X86::VFMSUBADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD132PDZr, X86::VFMSUBADD132PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD132PSZ128r, X86::VFMSUBADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD132PSZ256r, X86::VFMSUBADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD132PSZr, X86::VFMSUBADD132PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD213PDZ128r, X86::VFMSUBADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD213PDZ256r, X86::VFMSUBADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD213PDZr, X86::VFMSUBADD213PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD213PSZ128r, X86::VFMSUBADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD213PSZ256r, X86::VFMSUBADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD213PSZr, X86::VFMSUBADD213PSZmb, TB_BCAST_SS }, + { X86::VFMSUBADD231PDZ128r, X86::VFMSUBADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFMSUBADD231PDZ256r, X86::VFMSUBADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFMSUBADD231PDZr, X86::VFMSUBADD231PDZmb, TB_BCAST_SD }, + { X86::VFMSUBADD231PSZ128r, X86::VFMSUBADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFMSUBADD231PSZ256r, X86::VFMSUBADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFMSUBADD231PSZr, X86::VFMSUBADD231PSZmb, TB_BCAST_SS }, + { X86::VFNMADD132PDZ128r, X86::VFNMADD132PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD132PDZ256r, X86::VFNMADD132PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD132PDZr, X86::VFNMADD132PDZmb, TB_BCAST_SD }, + { X86::VFNMADD132PSZ128r, 
X86::VFNMADD132PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD132PSZ256r, X86::VFNMADD132PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD132PSZr, X86::VFNMADD132PSZmb, TB_BCAST_SS }, + { X86::VFNMADD213PDZ128r, X86::VFNMADD213PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD213PDZ256r, X86::VFNMADD213PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD213PDZr, X86::VFNMADD213PDZmb, TB_BCAST_SD }, + { X86::VFNMADD213PSZ128r, X86::VFNMADD213PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD213PSZ256r, X86::VFNMADD213PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD213PSZr, X86::VFNMADD213PSZmb, TB_BCAST_SS }, + { X86::VFNMADD231PDZ128r, X86::VFNMADD231PDZ128mb, TB_BCAST_SD }, + { X86::VFNMADD231PDZ256r, X86::VFNMADD231PDZ256mb, TB_BCAST_SD }, + { X86::VFNMADD231PDZr, X86::VFNMADD231PDZmb, TB_BCAST_SD }, + { X86::VFNMADD231PSZ128r, X86::VFNMADD231PSZ128mb, TB_BCAST_SS }, + { X86::VFNMADD231PSZ256r, X86::VFNMADD231PSZ256mb, TB_BCAST_SS }, + { X86::VFNMADD231PSZr, X86::VFNMADD231PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB132PDZ128r, X86::VFNMSUB132PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB132PDZ256r, X86::VFNMSUB132PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB132PDZr, X86::VFNMSUB132PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB132PSZ128r, X86::VFNMSUB132PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB132PSZ256r, X86::VFNMSUB132PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB132PSZr, X86::VFNMSUB132PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB213PDZ128r, X86::VFNMSUB213PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB213PDZ256r, X86::VFNMSUB213PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB213PDZr, X86::VFNMSUB213PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB213PSZ128r, X86::VFNMSUB213PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB213PSZ256r, X86::VFNMSUB213PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB213PSZr, X86::VFNMSUB213PSZmb, TB_BCAST_SS }, + { X86::VFNMSUB231PDZ128r, X86::VFNMSUB231PDZ128mb, TB_BCAST_SD }, + { X86::VFNMSUB231PDZ256r, X86::VFNMSUB231PDZ256mb, TB_BCAST_SD }, + { X86::VFNMSUB231PDZr, X86::VFNMSUB231PDZmb, TB_BCAST_SD }, + { X86::VFNMSUB231PSZ128r, 
X86::VFNMSUB231PSZ128mb, TB_BCAST_SS }, + { X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS }, + { X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS }, +}; + static const X86MemoryFoldTableEntry * lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { #ifndef NDEBUG @@ -5356,6 +5467,12 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) { std::end(BroadcastFoldTable2)) == std::end(BroadcastFoldTable2) && "BroadcastFoldTable2 is not sorted and unique!"); + assert(std::is_sorted(std::begin(BroadcastFoldTable3), + std::end(BroadcastFoldTable3)) && + std::adjacent_find(std::begin(BroadcastFoldTable3), + std::end(BroadcastFoldTable3)) == + std::end(BroadcastFoldTable3) && + "BroadcastFoldTable3 is not sorted and unique!"); FoldTablesChecked.store(true, std::memory_order_relaxed); } #endif @@ -5429,6 +5546,10 @@ struct X86MemUnfoldTable { // Index 2, folded broadcast addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3) + // Index 3, folded broadcast + addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST); + // Sort the memory->reg unfold table. 
array_pod_sort(Table.begin(), Table.end()); diff --git a/test/CodeGen/X86/avx512-broadcast-unfold.ll b/test/CodeGen/X86/avx512-broadcast-unfold.ll index 9ff33099ad6..0cc697a51ed 100644 --- a/test/CodeGen/X86/avx512-broadcast-unfold.ll +++ b/test/CodeGen/X86/avx512-broadcast-unfold.ll @@ -1651,12 +1651,13 @@ define void @bcast_unfold_fma213_v4f32(float* %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v4f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB48_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm0 -; CHECK-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm0 * xmm0) + mem -; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vfmadd213ps {{.*#+}} xmm1 = (xmm1 * xmm1) + xmm0 +; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB48_1 ; CHECK-NEXT: # %bb.2: # %bb11 @@ -1685,12 +1686,13 @@ define void @bcast_unfold_fma231_v4f32(float* %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v4f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB49_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm0 -; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm0 -; CHECK-NEXT: vmovups %xmm0, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %xmm1 +; CHECK-NEXT: vfmadd231ps {{.*#+}} xmm1 = (xmm1 * xmm0) + xmm1 +; CHECK-NEXT: vmovups %xmm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $16, %rax ; CHECK-NEXT: jne .LBB49_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1719,12 +1721,13 @@ define void @bcast_unfold_fma213_v8f32(float* %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v8f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq 
$-4096, %rax # imm = 0xF000 +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB50_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm0 -; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm0 * ymm0) + mem -; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vfmadd213ps {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0 +; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB50_1 ; CHECK-NEXT: # %bb.2: # %bb11 @@ -1754,12 +1757,13 @@ define void @bcast_unfold_fma231_v8f32(float* %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v8f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB51_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm0 -; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm0 -; CHECK-NEXT: vmovups %ymm0, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %ymm1 +; CHECK-NEXT: vfmadd231ps {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 +; CHECK-NEXT: vmovups %ymm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB51_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1789,12 +1793,13 @@ define void @bcast_unfold_fma213_v16f32(float* %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v16f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB52_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm0 -; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm0 = 
(zmm0 * zmm0) + mem -; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vfmadd213ps {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0 +; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB52_1 ; CHECK-NEXT: # %bb.2: # %bb11 @@ -1824,12 +1829,13 @@ define void @bcast_unfold_fma231_v16f32(float* %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v16f32: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-4096, %rax # imm = 0xF000 +; CHECK-NEXT: vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB53_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm0 -; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm0 -; CHECK-NEXT: vmovups %zmm0, 4096(%rdi,%rax) +; CHECK-NEXT: vmovups 4096(%rdi,%rax), %zmm1 +; CHECK-NEXT: vfmadd231ps {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 +; CHECK-NEXT: vmovups %zmm1, 4096(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB53_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1929,12 +1935,13 @@ define void @bcast_unfold_fma213_v4f64(double* %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v4f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB56_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm0 -; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm0 = (ymm0 * ymm0) + mem -; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm1 = (ymm1 * ymm1) + ymm0 +; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB56_1 ; CHECK-NEXT: # %bb.2: # %bb11 @@ -1964,12 +1971,13 @@ define void 
@bcast_unfold_fma231_v4f64(double* %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v4f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB57_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm0 -; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm0 = (ymm0 * mem) + ymm0 -; CHECK-NEXT: vmovupd %ymm0, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %ymm1 +; CHECK-NEXT: vfmadd231pd {{.*#+}} ymm1 = (ymm1 * ymm0) + ymm1 +; CHECK-NEXT: vmovupd %ymm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $32, %rax ; CHECK-NEXT: jne .LBB57_1 ; CHECK-NEXT: # %bb.2: # %bb10 @@ -1999,12 +2007,13 @@ define void @bcast_unfold_fma213_v8f64(double* %arg) { ; CHECK-LABEL: bcast_unfold_fma213_v8f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB58_1: # %bb2 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm0 -; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm0 = (zmm0 * zmm0) + mem -; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 +; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm1 = (zmm1 * zmm1) + zmm0 +; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB58_1 ; CHECK-NEXT: # %bb.2: # %bb11 @@ -2034,12 +2043,13 @@ define void @bcast_unfold_fma231_v8f64(double* %arg) { ; CHECK-LABEL: bcast_unfold_fma231_v8f64: ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movq $-8192, %rax # imm = 0xE000 +; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB59_1: # %bb1 ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm0 -; 
CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm0 * mem) + zmm0 -; CHECK-NEXT: vmovupd %zmm0, 8192(%rdi,%rax) +; CHECK-NEXT: vmovupd 8192(%rdi,%rax), %zmm1 +; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm1 * zmm0) + zmm1 +; CHECK-NEXT: vmovupd %zmm1, 8192(%rdi,%rax) ; CHECK-NEXT: addq $64, %rax ; CHECK-NEXT: jne .LBB59_1 ; CHECK-NEXT: # %bb.2: # %bb10 -- 2.50.1