From ece1ebdf1250ed7fa3a1ee51a9f649005378d8a4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 14 Jun 2016 09:43:38 +0000 Subject: [PATCH] [X86][SSE4A] Added patterns for nontemporal stores of scalar float/doubles using MOVNTSD/MOVNTSS git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272651 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86InstrSSE.td | 14 ++++++-- test/CodeGen/X86/nontemporal-2.ll | 57 ++++++++++++++++++++++++------- 2 files changed, 56 insertions(+), 15 deletions(-) diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 661f733a1b9..f589342437d 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7774,6 +7774,8 @@ def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), VR128:$mask))]>, XD; } +// Non-temporal (unaligned) scalar stores. +let AddedComplexity = 400 in { // Prefer non-temporal versions def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), "movntss\t{$src, $dst|$dst, $src}", [(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS; @@ -7781,7 +7783,15 @@ def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src), "movntsd\t{$src, $dst|$dst, $src}", [(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD; -} + +def : Pat<(nontemporalstore FR32:$src, addr:$dst), + (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; + +def : Pat<(nontemporalstore FR64:$src, addr:$dst), + (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; + +} // AddedComplexity +} // HasSSE4A //===----------------------------------------------------------------------===// // AVX Instructions @@ -8364,7 +8374,7 @@ let Predicates = [HasAVX2, NoVLX] in { (VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>; def : Pat<(v4f64 (X86VBroadcast FR64:$src)), (VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>; - } + } } let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in { diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll index b8fdfe90441..e221f8e9520 100644 --- a/test/CodeGen/X86/nontemporal-2.ll +++ b/test/CodeGen/X86/nontemporal-2.ll @@ -386,10 +386,20 @@ define void @test_zero_v32i8(<32 x i8>* %dst) { ; Scalar versions. define void @test_arg_f32(float %arg, float* %dst) { -; SSE-LABEL: test_arg_f32: -; SSE: # BB#0: -; SSE-NEXT: movss %xmm0, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_arg_f32: +; SSE2: # BB#0: +; SSE2-NEXT: movss %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_arg_f32: +; SSE4A: # BB#0: +; SSE4A-NEXT: movntss %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_arg_f32: +; SSE41: # BB#0: +; SSE41-NEXT: movss %xmm0, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_arg_f32: ; AVX: # BB#0: @@ -424,10 +434,20 @@ define void @test_arg_i32(i32 %arg, i32* %dst) { } define void @test_arg_f64(double %arg, double* %dst) { -; SSE-LABEL: test_arg_f64: -; SSE: # BB#0: -; SSE-NEXT: movsd %xmm0, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_arg_f64: +; SSE2: # BB#0: +; SSE2-NEXT: movsd %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_arg_f64: +; SSE4A: # BB#0: +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_arg_f64: +; SSE41: # BB#0: +; SSE41-NEXT: movsd %xmm0, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_arg_f64: ; AVX: # BB#0: @@ -473,7 +493,7 @@ define void @test_extract_f32(<4 x float> %arg, float* %dst) { ; SSE4A-LABEL: test_extract_f32: ; SSE4A: # BB#0: ; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE4A-NEXT: movss %xmm0, (%rdi) +; SSE4A-NEXT: movntss %xmm0, (%rdi) ; SSE4A-NEXT: retq ; ; SSE41-LABEL: test_extract_f32: @@ -536,10 +556,21 @@ define void @test_extract_i32(<4 x i32> %arg, i32* %dst) { } define void @test_extract_f64(<2 x double> %arg, double* %dst) { -; SSE-LABEL: test_extract_f64: -; SSE: # BB#0: -; SSE-NEXT: movhpd %xmm0, (%rdi) -; SSE-NEXT: retq +; SSE2-LABEL: test_extract_f64: +; SSE2: # BB#0: +; SSE2-NEXT: movhpd %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4A-LABEL: test_extract_f64: +; SSE4A: # BB#0: +; SSE4A-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; SSE4A-NEXT: movntsd %xmm0, (%rdi) +; SSE4A-NEXT: retq +; +; SSE41-LABEL: test_extract_f64: +; SSE41: # BB#0: +; SSE41-NEXT: movhpd %xmm0, (%rdi) +; SSE41-NEXT: retq ; ; AVX-LABEL: test_extract_f64: ; AVX: # BB#0: -- 2.50.1