From 2a42c3b9a18c5b8ec125da3293b068971cbdedca Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 19 Jan 2017 02:34:29 +0000
Subject: [PATCH] [AVX-512] Use VSHUF instructions instead of two inserts as
 fallback for subvector broadcasts that can't fold the load.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@292466 91177308-0d34-0410-b5e6-96231b3b80d8
---
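The fallback patterns this patch rewrites fire when the loaded 128-bit
subvector has users besides the broadcast itself, which prevents folding the
load into VBROADCASTI32X4/VBROADCASTF32X4. A reduced IR sketch of that
situation, modeled on the test_broadcast_4i32_16i32_chain test updated below
(the function name and the llc invocation are illustrative, not taken from
the test suite):

    ; llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f
    define <16 x i32> @bcast_4i32_to_16i32(<4 x i32>* %p0, <4 x float>* %p1) {
      ; The load feeds the broadcast, but the store below also consumes its
      ; chain output, so ISel cannot fold the load into the broadcast
      ; instruction and must broadcast from a register instead.
      %v = load <4 x i32>, <4 x i32>* %p0
      store <4 x float> zeroinitializer, <4 x float>* %p1
      ; Replicate the four elements into all four 128-bit lanes.
      %b = shufflevector <4 x i32> %v, <4 x i32> undef,
           <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3,
                       i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
      ret <16 x i32> %b
    }

Previously the register fallback was synthesized with two inserts
(vinserti32x4 $1 followed by vinserti64x4 $1, or vinserti32x8 on AVX512DQ);
with the new patterns a single vshufi32x4 $0, %zmm0, %zmm0, %zmm0 performs
the whole broadcast, because immediate 0 selects 128-bit lane 0 of each
source operand for every destination lane, as the updated CHECK lines show.
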
 lib/Target/X86/X86InstrAVX512.td        | 111 +++++++-----------------
 test/CodeGen/X86/subvector-broadcast.ll |  18 ++--
 2 files changed, 39 insertions(+), 90 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index d2b9b1f45b1..74e73d4e872 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1092,46 +1092,6 @@ def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4rm addr:$src)>;
 def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
           (VBROADCASTI32X4rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
-          (VINSERTF64x4Zrr
-           (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
-                                           VR128X:$src, sub_xmm),
-                            VR128X:$src, 1),
-           (EXTRACT_SUBREG
-            (v8f64 (VINSERTF32x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
-                                                   VR128X:$src, sub_xmm),
-                                    VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
-          (VINSERTI64x4Zrr
-           (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
-                                           VR128X:$src, sub_xmm),
-                            VR128X:$src, 1),
-           (EXTRACT_SUBREG
-            (v8i64 (VINSERTI32x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
-                                                   VR128X:$src, sub_xmm),
-                                    VR128X:$src, 1)), sub_ymm), 1)>;
-
-def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
-          (VINSERTI64x4Zrr
-           (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
-                                           VR128X:$src, sub_xmm),
-                            VR128X:$src, 1),
-           (EXTRACT_SUBREG
-            (v32i16 (VINSERTI32x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)),
-                                                    VR128X:$src, sub_xmm),
-                                     VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
-          (VINSERTI64x4Zrr
-           (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
-                                           VR128X:$src, sub_xmm),
-                            VR128X:$src, 1),
-           (EXTRACT_SUBREG
-            (v64i8 (VINSERTI32x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)),
-                                                   VR128X:$src, sub_xmm),
-                                    VR128X:$src, 1)), sub_ymm), 1)>;
 }
 
 let Predicates = [HasVLX] in {
@@ -1203,25 +1163,6 @@ def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
 def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
           (VBROADCASTI32X4rm addr:$src)>;
 
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
-          (VINSERTF64x4Zrr
-           (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
-                                           VR128X:$src, sub_xmm),
-                            VR128X:$src, 1),
-           (EXTRACT_SUBREG
-            (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
-                                                    VR128X:$src, sub_xmm),
-                                     VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
-          (VINSERTI64x4Zrr
-           (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
-                                           VR128X:$src, sub_xmm),
-                            VR128X:$src, 1),
-           (EXTRACT_SUBREG
-            (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
-                                                    VR128X:$src, sub_xmm),
-                                     VR128X:$src, 1)), sub_ymm), 1)>;
-
 def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
           (VBROADCASTF64X4rm addr:$src)>;
 def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
@@ -1259,25 +1200,6 @@ def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
 def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
           (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
                            (v8i32 VR256X:$src), 1)>;
-
-def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
-          (VINSERTF32x8Zrr
-           (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
-                                           VR128X:$src, sub_xmm),
-                            VR128X:$src, 1),
-           (EXTRACT_SUBREG
-            (v16f32 (VINSERTF32x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
-                                                    VR128X:$src, sub_xmm),
-                                     VR128X:$src, 1)), sub_ymm), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
-          (VINSERTI32x8Zrr
-           (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
-                                           VR128X:$src, sub_xmm),
-                            VR128X:$src, 1),
-           (EXTRACT_SUBREG
-            (v16i32 (VINSERTI32x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
-                                                    VR128X:$src, sub_xmm),
-                                     VR128X:$src, 1)), sub_ymm), 1)>;
 }
 
 multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
@@ -8471,6 +8393,39 @@ defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
 defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
       AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
 
+let Predicates = [HasAVX512] in {
+// Provide fallback in case the load node that is used in the broadcast
+// patterns above is used by additional users, which prevents the pattern
+// selection.
+def : Pat<(v8f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+          (VSHUFF64X2Zrri (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          0)>;
+def : Pat<(v8i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+          (VSHUFI64X2Zrri (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          0)>;
+
+def : Pat<(v16f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
+          (VSHUFF32X4Zrri (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          0)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
+          (VSHUFI32X4Zrri (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          0)>;
+
+def : Pat<(v32i16 (X86SubVBroadcast (v8i16 VR128X:$src))),
+          (VSHUFI32X4Zrri (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          0)>;
+
+def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
+          (VSHUFI32X4Zrri (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+                          0)>;
+}
+
 multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
   defm NAME:       avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
                    AVX512AIi8Base, EVEX_4V;
diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll
index 7aa3f393bbe..5082101a6d4 100644
--- a/test/CodeGen/X86/subvector-broadcast.ll
+++ b/test/CodeGen/X86/subvector-broadcast.ll
@@ -1225,8 +1225,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
 ; X32-AVX512F-NEXT:    vmovdqa (%ecx), %xmm0
 ; X32-AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX512F-NEXT:    vmovdqa %xmm1, (%eax)
-; X32-AVX512F-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X32-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512F-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1236,8 +1235,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
 ; X32-AVX512BW-NEXT:    vmovdqa (%ecx), %xmm0
 ; X32-AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-AVX512BW-NEXT:    vmovdqa %xmm1, (%eax)
-; X32-AVX512BW-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X32-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512BW-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1247,8 +1245,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
 ; X32-AVX512DQ-NEXT:    vmovdqa (%ecx), %xmm0
 ; X32-AVX512DQ-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-AVX512DQ-NEXT:    vmovaps %xmm1, (%eax)
-; X32-AVX512DQ-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X32-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X32-AVX512DQ-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X32-AVX512DQ-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1265,8 +1262,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
 ; X64-AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
 ; X64-AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-AVX512F-NEXT:    vmovdqa %xmm1, (%rsi)
-; X64-AVX512F-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X64-AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512F-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1274,8 +1270,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
 ; X64-AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
 ; X64-AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-AVX512BW-NEXT:    vmovdqa %xmm1, (%rsi)
-; X64-AVX512BW-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X64-AVX512BW-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512BW-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_4i32_16i32_chain:
@@ -1283,8 +1278,7 @@ define <16 x i32> @test_broadcast_4i32_16i32_chain(<4 x i32>* %p0, <4 x float>*
 ; X64-AVX512DQ-NEXT:    vmovdqa (%rdi), %xmm0
 ; X64-AVX512DQ-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-AVX512DQ-NEXT:    vmovaps %xmm1, (%rsi)
-; X64-AVX512DQ-NEXT:    vinserti32x4 $1, %xmm0, %zmm0, %zmm0
-; X64-AVX512DQ-NEXT:    vinserti32x8 $1, %ymm0, %zmm0, %zmm0
+; X64-AVX512DQ-NEXT:    vshufi32x4 {{.*#+}} zmm0 = zmm0[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; X64-AVX512DQ-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %p0
   store <4 x float> zeroinitializer, <4 x float>* %p1
-- 
2.40.0