VR128:$mask))]>, XD;
}
+// Non-temporal (unaligned) scalar stores: MOVNTSS (f32) and MOVNTSD (f64).
+let AddedComplexity = 400 in { // Prefer the non-temporal stores when selecting
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
"movntss\t{$src, $dst|$dst, $src}",
[(int_x86_sse4a_movnt_ss addr:$dst, VR128:$src)]>, XS;
def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
"movntsd\t{$src, $dst|$dst, $src}",
[(int_x86_sse4a_movnt_sd addr:$dst, VR128:$src)]>, XD;
-}
+// Select MOVNTSS/MOVNTSD for scalar FP nontemporalstore via an FR->VR128 copy.
+def : Pat<(nontemporalstore FR32:$src, addr:$dst),
+ (MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+def : Pat<(nontemporalstore FR64:$src, addr:$dst),
+ (MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+} // end AddedComplexity = 400
+} // HasSSE4A
//===----------------------------------------------------------------------===//
// AVX Instructions
(VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
(VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
- }
+ }
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI], AddedComplexity = 20 in {
; Scalar versions.
define void @test_arg_f32(float %arg, float* %dst) {
-; SSE-LABEL: test_arg_f32:
-; SSE: # BB#0:
-; SSE-NEXT: movss %xmm0, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_arg_f32:
+; SSE2: # BB#0:
+; SSE2-NEXT: movss %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_arg_f32:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movntss %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_arg_f32:
+; SSE41: # BB#0:
+; SSE41-NEXT: movss %xmm0, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f32:
; AVX: # BB#0:
}
define void @test_arg_f64(double %arg, double* %dst) {
-; SSE-LABEL: test_arg_f64:
-; SSE: # BB#0:
-; SSE-NEXT: movsd %xmm0, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_arg_f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_arg_f64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_arg_f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movsd %xmm0, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_arg_f64:
; AVX: # BB#0:
; SSE4A-LABEL: test_extract_f32:
; SSE4A: # BB#0:
; SSE4A-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE4A-NEXT: movss %xmm0, (%rdi)
+; SSE4A-NEXT: movntss %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
; SSE41-LABEL: test_extract_f32:
}
define void @test_extract_f64(<2 x double> %arg, double* %dst) {
-; SSE-LABEL: test_extract_f64:
-; SSE: # BB#0:
-; SSE-NEXT: movhpd %xmm0, (%rdi)
-; SSE-NEXT: retq
+; SSE2-LABEL: test_extract_f64:
+; SSE2: # BB#0:
+; SSE2-NEXT: movhpd %xmm0, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE4A-LABEL: test_extract_f64:
+; SSE4A: # BB#0:
+; SSE4A-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE4A-NEXT: movntsd %xmm0, (%rdi)
+; SSE4A-NEXT: retq
+;
+; SSE41-LABEL: test_extract_f64:
+; SSE41: # BB#0:
+; SSE41-NEXT: movhpd %xmm0, (%rdi)
+; SSE41-NEXT: retq
;
; AVX-LABEL: test_extract_f64:
; AVX: # BB#0: