From 6bf1d9e47ccbaba32626e2261860fc225e51c17d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 21 Jul 2017 00:40:42 +0000 Subject: [PATCH] [AVX-512] Fix a bug that prevented some non-temporal loads from using the movntdqa instruction. The bitconverts here had an input type of 128 bits and an output type of 256 bits. The input type should also have been 256 bits. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@308702 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrAVX512.td | 6 ++--- test/CodeGen/X86/nontemporal-loads.ll | 39 +++++++-------------------- 2 files changed, 12 insertions(+), 33 deletions(-) diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 705d0f7a5cf..ccfe7ce615d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -4328,11 +4328,11 @@ let Predicates = [HasVLX], AddedComplexity = 400 in { (VMOVNTDQAZ256rm addr:$src)>; def : Pat<(v4i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v8i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), + def : Pat<(v8i32 (bitconvert (v4i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v16i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), + def : Pat<(v16i16 (bitconvert (v4i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v32i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), + def : Pat<(v32i8 (bitconvert (v4i64 (alignednontemporalload addr:$src)))), (VMOVNTDQAZ256rm addr:$src)>; def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst), diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll index 3c916fd38c6..a9e42ad5f7b 100644 --- a/test/CodeGen/X86/nontemporal-loads.ll +++ b/test/CodeGen/X86/nontemporal-loads.ll @@ -211,20 +211,10 @@ define <8 x i32> @test_v8i32(<8 x i32>* %src) { ; AVX2-NEXT: vmovntdqa (%rdi), %ymm0 ; AVX2-NEXT: retq ; -; 
AVX512F-LABEL: test_v8i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_v8i32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vmovaps (%rdi), %ymm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_v8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %ymm0 +; AVX512-NEXT: retq %1 = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !1 ret <8 x i32> %1 } @@ -876,22 +866,11 @@ define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) { ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: test_arg_v8i32: -; AVX512F: # BB#0: -; AVX512F-NEXT: vmovntdqa (%rdi), %ymm1 -; AVX512F-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: test_arg_v8i32: -; AVX512BW: # BB#0: -; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm1 -; AVX512BW-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_arg_v8i32: -; AVX512VL: # BB#0: -; AVX512VL-NEXT: vpaddd (%rdi), %ymm0, %ymm0 -; AVX512VL-NEXT: retq +; AVX512-LABEL: test_arg_v8i32: +; AVX512: # BB#0: +; AVX512-NEXT: vmovntdqa (%rdi), %ymm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq %1 = load <8 x i32>, <8 x i32>* %src, align 32, !nontemporal !1 %2 = add <8 x i32> %arg, %1 ret <8 x i32> %2 -- 2.40.0