From e9d3d132cef34aba5a1e193645f10f82b2f9a197 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 19 Apr 2017 10:52:09 +0000 Subject: [PATCH] [X86][SSE] Add scheduling latency/throughput tests for (most) SSE2 instructions git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@300671 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/sse2-schedule.ll | 6039 +++++++++++++++++++++++++++++ 1 file changed, 6039 insertions(+) create mode 100644 test/CodeGen/X86/sse2-schedule.ll diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll new file mode 100644 index 00000000000..33a4f413b68 --- /dev/null +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -0,0 +1,6039 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=atom | FileCheck %s --check-prefix=CHECK --check-prefix=ATOM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=slm | FileCheck %s --check-prefix=CHECK --check-prefix=SLM +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=sandybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=ivybridge | FileCheck %s --check-prefix=CHECK --check-prefix=SANDY +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2 + +define 
<2 x double> @test_addpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_addpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: addpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_addpd: +; ATOM: # BB#0: +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: addpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_addpd: +; SLM: # BB#0: +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_addpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fadd <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fadd <2 x double> %1, %2 + ret <2 x double> %3 +} + +define double @test_addsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_addsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: addsd %xmm1, %xmm0 +; GENERIC-NEXT: addsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_addsd: +; ATOM: # BB#0: +; ATOM-NEXT: addsd %xmm1, %xmm0 +; ATOM-NEXT: addsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_addsd: +; SLM: # BB#0: +; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: addsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_addsd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vaddsd (%rdi), %xmm0, 
%xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_addsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_addsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fadd double %a0, %a1 + %2 = load double, double *%a2, align 8 + %3 = fadd double %1, %2 + ret double %3 +} + +define <2 x double> @test_andpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_andpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: andpd %xmm1, %xmm0 +; GENERIC-NEXT: andpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_andpd: +; ATOM: # BB#0: +; ATOM-NEXT: andpd %xmm1, %xmm0 +; ATOM-NEXT: andpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_andpd: +; SLM: # BB#0: +; SLM-NEXT: andpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: andpd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_andpd: +; SANDY: # BB#0: +; SANDY-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vandpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] 
+; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <2 x double> %a0 to <4 x i32> + %2 = bitcast <2 x double> %a1 to <4 x i32> + %3 = and <4 x i32> %1, %2 + %4 = load <2 x double>, <2 x double> *%a2, align 16 + %5 = bitcast <2 x double> %4 to <4 x i32> + %6 = and <4 x i32> %3, %5 + %7 = bitcast <4 x i32> %6 to <2 x double> + %8 = fadd <2 x double> %a1, %7 + ret <2 x double> %8 +} + +define <2 x double> @test_andnotpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_andnotpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: andnpd %xmm1, %xmm0 +; GENERIC-NEXT: andnpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_andnotpd: +; ATOM: # BB#0: +; ATOM-NEXT: andnpd %xmm1, %xmm0 +; ATOM-NEXT: andnpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_andnotpd: +; SLM: # BB#0: +; SLM-NEXT: andnpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: andnpd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_andnotpd: +; SANDY: # BB#0: +; SANDY-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_andnotpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_andnotpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vandnpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vandnpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <2 x double> 
%a0 to <4 x i32> + %2 = bitcast <2 x double> %a1 to <4 x i32> + %3 = xor <4 x i32> %1, + %4 = and <4 x i32> %3, %2 + %5 = load <2 x double>, <2 x double> *%a2, align 16 + %6 = bitcast <2 x double> %5 to <4 x i32> + %7 = xor <4 x i32> %4, + %8 = and <4 x i32> %6, %7 + %9 = bitcast <4 x i32> %8 to <2 x double> + %10 = fadd <2 x double> %a1, %9 + ret <2 x double> %10 +} + +define <2 x double> @test_cmppd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_cmppd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpeqpd %xmm0, %xmm1 +; GENERIC-NEXT: cmpeqpd (%rdi), %xmm0 +; GENERIC-NEXT: orpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cmppd: +; ATOM: # BB#0: +; ATOM-NEXT: cmpeqpd %xmm0, %xmm1 +; ATOM-NEXT: cmpeqpd (%rdi), %xmm0 +; ATOM-NEXT: orpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cmppd: +; SLM: # BB#0: +; SLM-NEXT: cmpeqpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: cmpeqpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: orpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cmppd: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmppd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmppd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqpd %xmm1, %xmm0, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: vorpd %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fcmp oeq <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fcmp oeq <2 x 
double> %a0, %2 + %4 = or <2 x i1> %1, %3 + %5 = sext <2 x i1> %4 to <2 x i64> + %6 = bitcast <2 x i64> %5 to <2 x double> + ret <2 x double> %6 +} + +define double @test_cmpsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_cmpsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpeqsd %xmm1, %xmm0 +; GENERIC-NEXT: cmpeqsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cmpsd: +; ATOM: # BB#0: +; ATOM-NEXT: cmpeqsd %xmm1, %xmm0 +; ATOM-NEXT: cmpeqsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cmpsd: +; SLM: # BB#0: +; SLM-NEXT: cmpeqsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: cmpeqsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cmpsd: +; SANDY: # BB#0: +; SANDY-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cmpsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cmpsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcmpeqsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcmpeqsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x double> undef, double %a0, i32 0 + %2 = insertelement <2 x double> undef, double %a1, i32 0 + %3 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %1, <2 x double> %2, i8 0) + %4 = load double, double *%a2, align 8 + %5 = insertelement <2 x double> undef, double %4, i32 0 + %6 = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %3, <2 x double> %5, i8 0) + %7 = extractelement <2 x double> %6, i32 0 + ret double %7 +} +declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone + +define i32 @test_comisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: 
test_comisd: +; GENERIC: # BB#0: +; GENERIC-NEXT: comisd %xmm1, %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %cl +; GENERIC-NEXT: andb %al, %cl +; GENERIC-NEXT: comisd (%rdi), %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %dl +; GENERIC-NEXT: andb %al, %dl +; GENERIC-NEXT: orb %cl, %dl +; GENERIC-NEXT: movzbl %dl, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_comisd: +; ATOM: # BB#0: +; ATOM-NEXT: comisd %xmm1, %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %cl +; ATOM-NEXT: andb %al, %cl +; ATOM-NEXT: comisd (%rdi), %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %dl +; ATOM-NEXT: andb %al, %dl +; ATOM-NEXT: orb %cl, %dl +; ATOM-NEXT: movzbl %dl, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_comisd: +; SLM: # BB#0: +; SLM-NEXT: comisd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %cl # sched: [1:0.50] +; SLM-NEXT: andb %al, %cl # sched: [1:0.50] +; SLM-NEXT: comisd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %dl # sched: [1:0.50] +; SLM-NEXT: andb %al, %dl # sched: [1:0.50] +; SLM-NEXT: orb %cl, %dl # sched: [1:0.50] +; SLM-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_comisd: +; SANDY: # BB#0: +; SANDY-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] +; SANDY-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %dl # sched: [1:0.33] +; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] +; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_comisd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %cl # sched: [1:0.50] +; 
HASWELL-NEXT: andb %al, %cl # sched: [1:0.25] +; HASWELL-NEXT: vcomisd (%rdi), %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %dl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] +; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] +; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_comisd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcomisd %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %cl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] +; BTVER2-NEXT: vcomisd (%rdi), %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %dl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %dl # sched: [1:0.50] +; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] +; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 8 + %3 = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %2) + %4 = or i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_cvtdq2pd(<4 x i32> %a0, <4 x i32> *%a1) { +; GENERIC-LABEL: test_cvtdq2pd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtdq2pd %xmm0, %xmm1 +; GENERIC-NEXT: cvtdq2pd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtdq2pd: +; ATOM: # BB#0: +; ATOM-NEXT: cvtdq2pd %xmm0, %xmm1 +; ATOM-NEXT: cvtdq2pd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtdq2pd: +; SLM: # BB#0: +; SLM-NEXT: cvtdq2pd %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtdq2pd (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtdq2pd: +; SANDY: 
# BB#0: +; SANDY-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtdq2pd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtdq2pd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtdq2pd (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtdq2pd %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> + %2 = sitofp <2 x i32> %1 to <2 x double> + %3 = load <4 x i32>, <4 x i32>*%a1, align 16 + %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> + %5 = sitofp <2 x i32> %4 to <2 x double> + %6 = fadd <2 x double> %2, %5 + ret <2 x double> %6 +} + +define <4 x float> @test_cvtdq2ps(<4 x i32> %a0, <4 x i32> *%a1) { +; GENERIC-LABEL: test_cvtdq2ps: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtdq2ps %xmm0, %xmm1 +; GENERIC-NEXT: cvtdq2ps (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtdq2ps: +; ATOM: # BB#0: +; ATOM-NEXT: cvtdq2ps (%rdi), %xmm1 +; ATOM-NEXT: cvtdq2ps %xmm0, %xmm0 +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtdq2ps: +; SLM: # BB#0: +; SLM-NEXT: cvtdq2ps %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtdq2ps (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtdq2ps: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # 
sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtdq2ps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtdq2ps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtdq2ps (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sitofp <4 x i32> %a0 to <4 x float> + %2 = load <4 x i32>, <4 x i32>*%a1, align 16 + %3 = sitofp <4 x i32> %2 to <4 x float> + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} + +define <4 x i32> @test_cvtpd2dq(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_cvtpd2dq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtpd2dq %xmm0, %xmm1 +; GENERIC-NEXT: cvtpd2dq (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtpd2dq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtpd2dq (%rdi), %xmm1 +; ATOM-NEXT: cvtpd2dq %xmm0, %xmm0 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtpd2dq: +; SLM: # BB#0: +; SLM-NEXT: cvtpd2dq %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtpd2dq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtpd2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtpd2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] 
+; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtpd2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtpd2dq %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %2) + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone + +define <4 x float> @test_cvtpd2ps(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_cvtpd2ps: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtpd2ps %xmm0, %xmm1 +; GENERIC-NEXT: cvtpd2ps (%rdi), %xmm0 +; GENERIC-NEXT: addps %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtpd2ps: +; ATOM: # BB#0: +; ATOM-NEXT: cvtpd2ps (%rdi), %xmm1 +; ATOM-NEXT: cvtpd2ps %xmm0, %xmm0 +; ATOM-NEXT: addps %xmm0, %xmm1 +; ATOM-NEXT: movaps %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtpd2ps: +; SLM: # BB#0: +; SLM-NEXT: cvtpd2ps %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtpd2ps (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: addps %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtpd2ps: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtpd2ps: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtpd2ps: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtpd2psx (%rdi), %xmm1 # sched: [8:1.00] +; 
BTVER2-NEXT: vcvtpd2ps %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %2) + %4 = fadd <4 x float> %1, %3 + ret <4 x float> %4 +} +declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone + +define <4 x i32> @test_cvtps2dq(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_cvtps2dq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtps2dq %xmm0, %xmm1 +; GENERIC-NEXT: cvtps2dq (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtps2dq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtps2dq (%rdi), %xmm1 +; ATOM-NEXT: cvtps2dq %xmm0, %xmm0 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtps2dq: +; SLM: # BB#0: +; SLM-NEXT: cvtps2dq %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtps2dq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtps2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtps2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtps2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2dq (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtps2dq %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = 
call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %2) + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone + +define <2 x double> @test_cvtps2pd(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_cvtps2pd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtps2pd %xmm0, %xmm1 +; GENERIC-NEXT: cvtps2pd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtps2pd: +; ATOM: # BB#0: +; ATOM-NEXT: cvtps2pd (%rdi), %xmm1 +; ATOM-NEXT: cvtps2pd %xmm0, %xmm0 +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movapd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtps2pd: +; SLM: # BB#0: +; SLM-NEXT: cvtps2pd %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvtps2pd (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtps2pd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtps2pd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtps2pd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtps2pd (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvtps2pd %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x float> %a0, <4 x float> undef, <2 x i32> + %2 = fpext <2 x float> %1 to <2 x double> + %3 = load <4 x float>, <4 x float> *%a1, align 16 + %4 = shufflevector 
<4 x float> %3, <4 x float> undef, <2 x i32> + %5 = fpext <2 x float> %4 to <2 x double> + %6 = fadd <2 x double> %2, %5 + ret <2 x double> %6 +} + +define i32 @test_cvtsd2si(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvtsd2si: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsd2si %xmm0, %ecx +; GENERIC-NEXT: cvtsd2si (%rdi), %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsd2si: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsd2si (%rdi), %eax +; ATOM-NEXT: cvtsd2si %xmm0, %ecx +; ATOM-NEXT: addl %ecx, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsd2si: +; SLM: # BB#0: +; SLM-NEXT: cvtsd2si (%rdi), %eax # sched: [7:1.00] +; SLM-NEXT: cvtsd2si %xmm0, %ecx # sched: [4:0.50] +; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsd2si: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00] +; SANDY-NEXT: vcvtsd2si (%rdi), %eax # sched: [7:1.00] +; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsd2si: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsd2si %xmm0, %ecx # sched: [4:1.00] +; HASWELL-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsd2si: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsd2si (%rdi), %eax # sched: [8:1.00] +; BTVER2-NEXT: vcvtsd2si %xmm0, %ecx # sched: [3:1.00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x double> undef, double %a0, i32 0 + %2 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %1) + %3 = load double, double *%a1, align 8 + %4 = insertelement <2 x double> undef, double %3, i32 0 + %5 = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %4) + %6 = add i32 %2, %5 + ret i32 %6 +} +declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone + +define i64 @test_cvtsd2siq(double %a0, double *%a1) { +; 
GENERIC-LABEL: test_cvtsd2siq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsd2si %xmm0, %rcx +; GENERIC-NEXT: cvtsd2si (%rdi), %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsd2siq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsd2si (%rdi), %rax +; ATOM-NEXT: cvtsd2si %xmm0, %rcx +; ATOM-NEXT: addq %rcx, %rax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsd2siq: +; SLM: # BB#0: +; SLM-NEXT: cvtsd2si (%rdi), %rax # sched: [7:1.00] +; SLM-NEXT: cvtsd2si %xmm0, %rcx # sched: [4:0.50] +; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsd2siq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] +; SANDY-NEXT: vcvtsd2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsd2siq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsd2si %xmm0, %rcx # sched: [4:1.00] +; HASWELL-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsd2siq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsd2si (%rdi), %rax # sched: [8:1.00] +; BTVER2-NEXT: vcvtsd2si %xmm0, %rcx # sched: [3:1.00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x double> undef, double %a0, i32 0 + %2 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %1) + %3 = load double, double *%a1, align 8 + %4 = insertelement <2 x double> undef, double %3, i32 0 + %5 = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %4) + %6 = add i64 %2, %5 + ret i64 %6 +} +declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone + +define float @test_cvtsd2ss(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvtsd2ss: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsd2ss %xmm0, %xmm1 +; GENERIC-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; GENERIC-NEXT: cvtsd2ss %xmm0, %xmm0 +; 
GENERIC-NEXT: addss %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsd2ss: +; ATOM: # BB#0: +; ATOM-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; ATOM-NEXT: cvtsd2ss %xmm0, %xmm2 +; ATOM-NEXT: xorps %xmm0, %xmm0 +; ATOM-NEXT: cvtsd2ss %xmm1, %xmm0 +; ATOM-NEXT: addss %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsd2ss: +; SLM: # BB#0: +; SLM-NEXT: cvtsd2ss %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [3:1.00] +; SLM-NEXT: cvtsd2ss %xmm0, %xmm0 # sched: [4:0.50] +; SLM-NEXT: addss %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsd2ss: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsd2ss: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [4:1.00] +; HASWELL-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsd2ss: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero sched: [5:1.00] +; BTVER2-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptrunc double %a0 to float + %2 = load double, double *%a1, align 8 + %3 = fptrunc double %2 to float + %4 = fadd float %1, %3 + ret float %4 +} + +define double @test_cvtsi2sd(i32 %a0, i32 *%a1) { +; GENERIC-LABEL: test_cvtsi2sd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsi2sdl %edi, %xmm1 +; GENERIC-NEXT: cvtsi2sdl (%rsi), %xmm0 
+; GENERIC-NEXT: addsd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsi2sd: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsi2sdl (%rsi), %xmm0 +; ATOM-NEXT: cvtsi2sdl %edi, %xmm1 +; ATOM-NEXT: addsd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsi2sd: +; SLM: # BB#0: +; SLM-NEXT: cvtsi2sdl (%rsi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: cvtsi2sdl %edi, %xmm1 # sched: [4:0.50] +; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsi2sd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsi2sd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsi2sd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsi2sdl %edi, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtsi2sdl (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sitofp i32 %a0 to double + %2 = load i32, i32 *%a1, align 8 + %3 = sitofp i32 %2 to double + %4 = fadd double %1, %3 + ret double %4 +} + +define double @test_cvtsi2sdq(i64 %a0, i64 *%a1) { +; GENERIC-LABEL: test_cvtsi2sdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtsi2sdq %rdi, %xmm1 +; GENERIC-NEXT: cvtsi2sdq (%rsi), %xmm0 +; GENERIC-NEXT: addsd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtsi2sdq: +; ATOM: # BB#0: +; ATOM-NEXT: cvtsi2sdq (%rsi), %xmm0 +; ATOM-NEXT: cvtsi2sdq %rdi, %xmm1 +; ATOM-NEXT: addsd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtsi2sdq: +; SLM: # BB#0: +; SLM-NEXT: cvtsi2sdq (%rsi), %xmm0 # sched: [7:1.00] 
+; SLM-NEXT: cvtsi2sdq %rdi, %xmm1 # sched: [4:0.50] +; SLM-NEXT: addsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtsi2sdq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; SANDY-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtsi2sdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtsi2sdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvtsi2sdq %rdi, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtsi2sdq (%rsi), %xmm1, %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sitofp i64 %a0 to double + %2 = load i64, i64 *%a1, align 8 + %3 = sitofp i64 %2 to double + %4 = fadd double %1, %3 + ret double %4 +} + +; TODO - cvtss2sd_m + +define double @test_cvtss2sd(float %a0, float *%a1) { +; GENERIC-LABEL: test_cvtss2sd: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvtss2sd %xmm0, %xmm1 +; GENERIC-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; GENERIC-NEXT: cvtss2sd %xmm0, %xmm0 +; GENERIC-NEXT: addsd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvtss2sd: +; ATOM: # BB#0: +; ATOM-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: cvtss2sd %xmm0, %xmm2 +; ATOM-NEXT: xorps %xmm0, %xmm0 +; ATOM-NEXT: cvtss2sd %xmm1, %xmm0 +; ATOM-NEXT: addsd %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvtss2sd: +; SLM: # BB#0: +; SLM-NEXT: cvtss2sd %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [3:1.00] +; SLM-NEXT: cvtss2sd %xmm0, %xmm0 # sched: [4:0.50] +; SLM-NEXT: addsd %xmm1, %xmm0 # sched: 
[3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvtss2sd: +; SANDY: # BB#0: +; SANDY-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; SANDY-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvtss2sd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [2:1.00] +; HASWELL-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvtss2sd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 # sched: [3:1.00] +; BTVER2-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fpext float %a0 to double + %2 = load float, float *%a1, align 4 + %3 = fpext float %2 to double + %4 = fadd double %1, %3 + ret double %4 +} + +define <4 x i32> @test_cvttpd2dq(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_cvttpd2dq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttpd2dq %xmm0, %xmm1 +; GENERIC-NEXT: cvttpd2dq (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttpd2dq: +; ATOM: # BB#0: +; ATOM-NEXT: cvttpd2dq (%rdi), %xmm1 +; ATOM-NEXT: cvttpd2dq %xmm0, %xmm0 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttpd2dq: +; SLM: # BB#0: +; SLM-NEXT: cvttpd2dq %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvttpd2dq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; 
+; SANDY-LABEL: test_cvttpd2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [7:1.00] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttpd2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [4:1.00] +; HASWELL-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttpd2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttpd2dqx (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvttpd2dq %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi <2 x double> %a0 to <2 x i32> + %2 = shufflevector <2 x i32> %1, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = load <2 x double>, <2 x double> *%a1, align 16 + %4 = fptosi <2 x double> %3 to <2 x i32> + %5 = shufflevector <2 x i32> %4, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %6 = add <4 x i32> %2, %5 + ret <4 x i32> %6 +} + +define <4 x i32> @test_cvttps2dq(<4 x float> %a0, <4 x float> *%a1) { +; GENERIC-LABEL: test_cvttps2dq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttps2dq %xmm0, %xmm1 +; GENERIC-NEXT: cvttps2dq (%rdi), %xmm0 +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttps2dq: +; ATOM: # BB#0: +; ATOM-NEXT: cvttps2dq (%rdi), %xmm1 +; ATOM-NEXT: cvttps2dq %xmm0, %xmm0 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttps2dq: +; SLM: # BB#0: +; SLM-NEXT: cvttps2dq %xmm0, %xmm1 # sched: [4:0.50] +; SLM-NEXT: cvttps2dq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttps2dq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vcvttps2dq (%rdi), 
%xmm1 # sched: [7:1.00] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttps2dq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [7:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttps2dq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttps2dq (%rdi), %xmm1 # sched: [8:1.00] +; BTVER2-NEXT: vcvttps2dq %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi <4 x float> %a0 to <4 x i32> + %2 = load <4 x float>, <4 x float> *%a1, align 16 + %3 = fptosi <4 x float> %2 to <4 x i32> + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define i32 @test_cvttsd2si(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvttsd2si: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttsd2si %xmm0, %ecx +; GENERIC-NEXT: cvttsd2si (%rdi), %eax +; GENERIC-NEXT: addl %ecx, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttsd2si: +; ATOM: # BB#0: +; ATOM-NEXT: cvttsd2si (%rdi), %eax +; ATOM-NEXT: cvttsd2si %xmm0, %ecx +; ATOM-NEXT: addl %ecx, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttsd2si: +; SLM: # BB#0: +; SLM-NEXT: cvttsd2si (%rdi), %eax # sched: [7:1.00] +; SLM-NEXT: cvttsd2si %xmm0, %ecx # sched: [4:0.50] +; SLM-NEXT: addl %ecx, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttsd2si: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] +; SANDY-NEXT: vcvttsd2si (%rdi), %eax # sched: [7:1.00] +; SANDY-NEXT: addl %ecx, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttsd2si: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttsd2si %xmm0, %ecx # sched: [4:1.00] +; HASWELL-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00] +; HASWELL-NEXT: addl %ecx, %eax # sched: [1:0.25] +; 
HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttsd2si: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttsd2si (%rdi), %eax # sched: [8:1.00] +; BTVER2-NEXT: vcvttsd2si %xmm0, %ecx # sched: [3:1.00] +; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi double %a0 to i32 + %2 = load double, double *%a1, align 8 + %3 = fptosi double %2 to i32 + %4 = add i32 %1, %3 + ret i32 %4 +} + +define i64 @test_cvttsd2siq(double %a0, double *%a1) { +; GENERIC-LABEL: test_cvttsd2siq: +; GENERIC: # BB#0: +; GENERIC-NEXT: cvttsd2si %xmm0, %rcx +; GENERIC-NEXT: cvttsd2si (%rdi), %rax +; GENERIC-NEXT: addq %rcx, %rax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_cvttsd2siq: +; ATOM: # BB#0: +; ATOM-NEXT: cvttsd2si (%rdi), %rax +; ATOM-NEXT: cvttsd2si %xmm0, %rcx +; ATOM-NEXT: addq %rcx, %rax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_cvttsd2siq: +; SLM: # BB#0: +; SLM-NEXT: cvttsd2si (%rdi), %rax # sched: [7:1.00] +; SLM-NEXT: cvttsd2si %xmm0, %rcx # sched: [4:0.50] +; SLM-NEXT: addq %rcx, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_cvttsd2siq: +; SANDY: # BB#0: +; SANDY-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] +; SANDY-NEXT: vcvttsd2si (%rdi), %rax # sched: [7:1.00] +; SANDY-NEXT: addq %rcx, %rax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_cvttsd2siq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vcvttsd2si %xmm0, %rcx # sched: [4:1.00] +; HASWELL-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00] +; HASWELL-NEXT: addq %rcx, %rax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_cvttsd2siq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vcvttsd2si (%rdi), %rax # sched: [8:1.00] +; BTVER2-NEXT: vcvttsd2si %xmm0, %rcx # sched: [3:1.00] +; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fptosi double %a0 to i64 + %2 = load double, double *%a1, align 8 + %3 = fptosi double %2 to i64 + %4 = add i64 
%1, %3 + ret i64 %4 +} + +define <2 x double> @test_divpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_divpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: divpd %xmm1, %xmm0 +; GENERIC-NEXT: divpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_divpd: +; ATOM: # BB#0: +; ATOM-NEXT: divpd %xmm1, %xmm0 +; ATOM-NEXT: divpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_divpd: +; SLM: # BB#0: +; SLM-NEXT: divpd %xmm1, %xmm0 # sched: [34:34.00] +; SLM-NEXT: divpd (%rdi), %xmm0 # sched: [37:34.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_divpd: +; SANDY: # BB#0: +; SANDY-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; SANDY-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; HASWELL-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivpd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivpd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fdiv <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fdiv <2 x double> %1, %2 + ret <2 x double> %3 +} + +define double @test_divsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_divsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: divsd %xmm1, %xmm0 +; GENERIC-NEXT: divsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_divsd: +; ATOM: # BB#0: +; ATOM-NEXT: divsd %xmm1, %xmm0 +; ATOM-NEXT: divsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_divsd: +; SLM: # BB#0: +; SLM-NEXT: divsd %xmm1, %xmm0 # sched: [34:34.00] +; SLM-NEXT: divsd (%rdi), %xmm0 # sched: [37:34.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_divsd: +; SANDY: # BB#0: +; SANDY-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # 
sched: [12:1.00] +; SANDY-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_divsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [12:1.00] +; HASWELL-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [16:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_divsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vdivsd %xmm1, %xmm0, %xmm0 # sched: [19:19.00] +; BTVER2-NEXT: vdivsd (%rdi), %xmm0, %xmm0 # sched: [24:19.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fdiv double %a0, %a1 + %2 = load double, double *%a2, align 8 + %3 = fdiv double %1, %2 + ret double %3 +} + +define void @test_lfence() { +; GENERIC-LABEL: test_lfence: +; GENERIC: # BB#0: +; GENERIC-NEXT: lfence +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_lfence: +; ATOM: # BB#0: +; ATOM-NEXT: lfence +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_lfence: +; SLM: # BB#0: +; SLM-NEXT: lfence # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_lfence: +; SANDY: # BB#0: +; SANDY-NEXT: lfence # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_lfence: +; HASWELL: # BB#0: +; HASWELL-NEXT: lfence # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_lfence: +; BTVER2: # BB#0: +; BTVER2-NEXT: lfence # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.sse2.lfence() + ret void +} +declare void @llvm.x86.sse2.lfence() nounwind readnone + +define void @test_mfence() { +; GENERIC-LABEL: test_mfence: +; GENERIC: # BB#0: +; GENERIC-NEXT: mfence +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_mfence: +; ATOM: # BB#0: +; ATOM-NEXT: mfence +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_mfence: +; SLM: # BB#0: +; SLM-NEXT: 
mfence # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mfence: +; SANDY: # BB#0: +; SANDY-NEXT: mfence # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mfence: +; HASWELL: # BB#0: +; HASWELL-NEXT: mfence # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mfence: +; BTVER2: # BB#0: +; BTVER2-NEXT: mfence # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.sse2.mfence() + ret void +} +declare void @llvm.x86.sse2.mfence() nounwind readnone + +define void @test_maskmovdqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) { +; GENERIC-LABEL: test_maskmovdqu: +; GENERIC: # BB#0: +; GENERIC-NEXT: maskmovdqu %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_maskmovdqu: +; ATOM: # BB#0: +; ATOM-NEXT: maskmovdqu %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_maskmovdqu: +; SLM: # BB#0: +; SLM-NEXT: maskmovdqu %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_maskmovdqu: +; SANDY: # BB#0: +; SANDY-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maskmovdqu: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [14:2.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maskmovdqu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaskmovdqu %xmm1, %xmm0 # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.sse2.maskmov.dqu(<16 x i8> %a0, <16 x i8> %a1, i8* %a2) + ret void +} +declare void @llvm.x86.sse2.maskmov.dqu(<16 x i8>, <16 x i8>, i8*) nounwind + +define <2 x double> @test_maxpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_maxpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: maxpd %xmm1, %xmm0 +; GENERIC-NEXT: maxpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_maxpd: +; ATOM: # BB#0: +; 
ATOM-NEXT: maxpd %xmm1, %xmm0 +; ATOM-NEXT: maxpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_maxpd: +; SLM: # BB#0: +; SLM-NEXT: maxpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: maxpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_maxpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_maxpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_maxsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_maxsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: maxsd %xmm1, %xmm0 +; GENERIC-NEXT: maxsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_maxsd: +; ATOM: # BB#0: +; ATOM-NEXT: maxsd %xmm1, %xmm0 +; ATOM-NEXT: maxsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_maxsd: +; SLM: # BB#0: +; SLM-NEXT: maxsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: maxsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_maxsd: +; SANDY: # BB#0: +; SANDY-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; 
+; HASWELL-LABEL: test_maxsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_maxsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmaxsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_minpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_minpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: minpd %xmm1, %xmm0 +; GENERIC-NEXT: minpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_minpd: +; ATOM: # BB#0: +; ATOM-NEXT: minpd %xmm1, %xmm0 +; ATOM-NEXT: minpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_minpd: +; SLM: # BB#0: +; SLM-NEXT: minpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: minpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_minpd: +; SANDY: # BB#0: +; SANDY-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vminpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> 
@llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_minsd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_minsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: minsd %xmm1, %xmm0 +; GENERIC-NEXT: minsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_minsd: +; ATOM: # BB#0: +; ATOM-NEXT: minsd %xmm1, %xmm0 +; ATOM-NEXT: minsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_minsd: +; SLM: # BB#0: +; SLM-NEXT: minsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: minsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_minsd: +; SANDY: # BB#0: +; SANDY-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_minsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_minsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vminsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vminsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %1, <2 x double> %2) + ret <2 x double> %3 +} +declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone + +define void @test_movapd(<2 x double> *%a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_movapd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movapd (%rdi), %xmm0 +; 
GENERIC-NEXT: addpd %xmm0, %xmm0 +; GENERIC-NEXT: movapd %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movapd: +; ATOM: # BB#0: +; ATOM-NEXT: movapd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm0, %xmm0 +; ATOM-NEXT: movapd %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movapd: +; SLM: # BB#0: +; SLM-NEXT: movapd (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movapd %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movapd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movapd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovapd (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movapd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovapd (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovapd %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <2 x double>, <2 x double> *%a0, align 16 + %2 = fadd <2 x double> %1, %1 + store <2 x double> %2, <2 x double> *%a1, align 16 + ret void +} + +define void @test_movdqa(<2 x i64> *%a0, <2 x i64> *%a1) { +; GENERIC-LABEL: test_movdqa: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm0, %xmm0 +; GENERIC-NEXT: movdqa %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movdqa: +; ATOM: # BB#0: +; ATOM-NEXT: movdqa (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm0, %xmm0 +; ATOM-NEXT: movdqa %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movdqa: +; SLM: # BB#0: +; SLM-NEXT: movdqa (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: paddq %xmm0, %xmm0 # sched: 
[1:0.50] +; SLM-NEXT: movdqa %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movdqa: +; SANDY: # BB#0: +; SANDY-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movdqa: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovdqa (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movdqa: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovdqa (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovdqa %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <2 x i64>, <2 x i64> *%a0, align 16 + %2 = add <2 x i64> %1, %1 + store <2 x i64> %2, <2 x i64> *%a1, align 16 + ret void +} + +define void @test_movdqu(<2 x i64> *%a0, <2 x i64> *%a1) { +; GENERIC-LABEL: test_movdqu: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqu (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm0, %xmm0 +; GENERIC-NEXT: movdqu %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movdqu: +; ATOM: # BB#0: +; ATOM-NEXT: movdqu (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm0, %xmm0 +; ATOM-NEXT: movdqu %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movdqu: +; SLM: # BB#0: +; SLM-NEXT: movdqu (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movdqu %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movdqu: +; SANDY: # BB#0: +; SANDY-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movdqu: +; HASWELL: # BB#0: +; 
HASWELL-NEXT: vmovdqu (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movdqu: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovdqu (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovdqu %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <2 x i64>, <2 x i64> *%a0, align 1 + %2 = add <2 x i64> %1, %1 + store <2 x i64> %2, <2 x i64> *%a1, align 1 + ret void +} + +define i32 @test_movd(<4 x i32> %a0, i32 %a1, i32 *%a2) { +; GENERIC-LABEL: test_movd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movd %edi, %xmm1 +; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: paddd %xmm0, %xmm1 +; GENERIC-NEXT: paddd %xmm0, %xmm2 +; GENERIC-NEXT: movd %xmm2, %eax +; GENERIC-NEXT: movd %xmm1, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movd: +; ATOM: # BB#0: +; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movd %xmm1, %eax +; ATOM-NEXT: movd %edi, %xmm1 +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movd %xmm1, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movd: +; SLM: # BB#0: +; SLM-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [3:1.00] +; SLM-NEXT: movd %edi, %xmm1 # sched: [1:0.50] +; SLM-NEXT: paddd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movd %xmm1, (%rsi) # sched: [1:1.00] +; SLM-NEXT: paddd %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: movd %xmm2, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovd %edi, %xmm1 # sched: [1:0.33] +; SANDY-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovd %xmm0, %eax # sched: 
[1:0.33] +; SANDY-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovd %edi, %xmm1 # sched: [1:1.00] +; HASWELL-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [4:0.50] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovd %xmm0, %eax # sched: [1:1.00] +; HASWELL-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero sched: [5:1.00] +; BTVER2-NEXT: vmovd %edi, %xmm1 # sched: [1:0.17] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vmovd %xmm1, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovd %xmm0, %eax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <4 x i32> undef, i32 %a1, i32 0 + %2 = load i32, i32 *%a2 + %3 = insertelement <4 x i32> undef, i32 %2, i32 0 + %4 = add <4 x i32> %a0, %1 + %5 = add <4 x i32> %a0, %3 + %6 = extractelement <4 x i32> %4, i32 0 + %7 = extractelement <4 x i32> %5, i32 0 + store i32 %6, i32* %a2 + ret i32 %7 +} + +define i64 @test_movd_64(<2 x i64> %a0, i64 %a1, i64 *%a2) { +; GENERIC-LABEL: test_movd_64: +; GENERIC: # BB#0: +; GENERIC-NEXT: movd %rdi, %xmm1 +; GENERIC-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; GENERIC-NEXT: paddq %xmm0, %xmm1 +; GENERIC-NEXT: paddq %xmm0, %xmm2 +; GENERIC-NEXT: movd %xmm2, %rax +; GENERIC-NEXT: movq %xmm1, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movd_64: +; ATOM: # BB#0: +; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; ATOM-NEXT: movd %rdi, %xmm2 +; ATOM-NEXT: paddq %xmm0, %xmm2 +; ATOM-NEXT: paddq %xmm0, %xmm1 +; ATOM-NEXT: movq %xmm2, (%rsi) +; ATOM-NEXT: movd %xmm1, %rax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movd_64: +; SLM: # BB#0: 
+; SLM-NEXT: movq {{.*#+}} xmm2 = mem[0],zero sched: [3:1.00] +; SLM-NEXT: movd %rdi, %xmm1 # sched: [1:0.50] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movq %xmm1, (%rsi) # sched: [1:1.00] +; SLM-NEXT: paddq %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: movd %xmm2, %rax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movd_64: +; SANDY: # BB#0: +; SANDY-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.33] +; SANDY-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovq %xmm0, %rax # sched: [1:0.33] +; SANDY-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movd_64: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovq %rdi, %xmm1 # sched: [1:1.00] +; HASWELL-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovq %xmm0, %rax # sched: [1:1.00] +; HASWELL-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movd_64: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero sched: [5:1.00] +; BTVER2-NEXT: vmovq %rdi, %xmm1 # sched: [1:0.17] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vmovq %xmm1, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovq %xmm0, %rax # sched: [1:0.17] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = insertelement <2 x i64> undef, i64 %a1, i64 0 + %2 = load i64, i64 *%a2 + %3 = insertelement <2 x i64> undef, i64 %2, i64 0 + %4 = add <2 x i64> %a0, %1 + %5 = add <2 x i64> %a0, %3 + %6 = extractelement <2 x i64> %4, i64 0 + %7 = extractelement <2 x i64> %5, i64 0 + store i64 %6, i64* %a2 + ret i64 %7 +} + +define void 
@test_movhpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { +; GENERIC-LABEL: test_movhpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; GENERIC-NEXT: addpd %xmm0, %xmm1 +; GENERIC-NEXT: movhpd %xmm1, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movhpd: +; ATOM: # BB#0: +; ATOM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movhpd %xmm1, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movhpd: +; SLM: # BB#0: +; SLM-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movhpd %xmm1, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movhpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movhpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movhpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovhpd %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast x86_mmx* %a2 to double* + %2 = load double, double *%1, align 8 + %3 = insertelement <2 x double> %a1, double %2, i32 1 + %4 = fadd <2 x double> %a0, %3 + %5 = extractelement <2 x double> %4, i32 1 + store double %5, double* %1 + ret void +} + +define void @test_movlpd(<2 x double> %a0, <2 x double> %a1, x86_mmx *%a2) { +; GENERIC-LABEL: test_movlpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; 
GENERIC-NEXT: addpd %xmm0, %xmm1 +; GENERIC-NEXT: movlpd %xmm1, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movlpd: +; ATOM: # BB#0: +; ATOM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movlpd %xmm1, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movlpd: +; SLM: # BB#0: +; SLM-NEXT: movlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [4:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movlpd %xmm1, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movlpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movlpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movlpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovlpd %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast x86_mmx* %a2 to double* + %2 = load double, double *%1, align 8 + %3 = insertelement <2 x double> %a1, double %2, i32 0 + %4 = fadd <2 x double> %a0, %3 + %5 = extractelement <2 x double> %4, i32 0 + store double %5, double* %1 + ret void +} + +define i32 @test_movmskpd(<2 x double> %a0) { +; GENERIC-LABEL: test_movmskpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movmskpd %xmm0, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movmskpd: +; ATOM: # BB#0: +; ATOM-NEXT: movmskpd %xmm0, %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movmskpd: +; SLM: # BB#0: +; SLM-NEXT: movmskpd 
%xmm0, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movmskpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movmskpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovmskpd %xmm0, %eax # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movmskpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovmskpd %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone + +define void @test_movntdqa(<2 x i64> %a0, <2 x i64> *%a1) { +; GENERIC-LABEL: test_movntdqa: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddq %xmm0, %xmm0 +; GENERIC-NEXT: movntdq %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movntdqa: +; ATOM: # BB#0: +; ATOM-NEXT: paddq %xmm0, %xmm0 +; ATOM-NEXT: movntdq %xmm0, (%rdi) +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movntdqa: +; SLM: # BB#0: +; SLM-NEXT: paddq %xmm0, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movntdq %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movntdqa: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntdqa: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntdqa: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddq %xmm0, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovntdq %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <2 x i64> %a0, %a0 + store <2 x i64> %1, <2 x i64> *%a1, align 16, !nontemporal !0 + ret void +} + +define void 
@test_movntpd(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_movntpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: addpd %xmm0, %xmm0 +; GENERIC-NEXT: movntpd %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movntpd: +; ATOM: # BB#0: +; ATOM-NEXT: addpd %xmm0, %xmm0 +; ATOM-NEXT: movntpd %xmm0, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movntpd: +; SLM: # BB#0: +; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movntpd %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movntpd: +; SANDY: # BB#0: +; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movntpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movntpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovntpd %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fadd <2 x double> %a0, %a0 + store <2 x double> %1, <2 x double> *%a1, align 16, !nontemporal !0 + ret void +} + +define <2 x i64> @test_movq_mem(<2 x i64> %a0, i64 *%a1) { +; GENERIC-LABEL: test_movq_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: movq %xmm0, (%rdi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movq_mem: +; ATOM: # BB#0: +; ATOM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: movq %xmm0, (%rdi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movq_mem: +; SLM: # BB#0: +; SLM-NEXT: movq {{.*#+}} xmm1 = mem[0],zero sched: [3:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movq %xmm0, (%rdi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movq_mem: +; 
SANDY: # BB#0: +; SANDY-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movq_mem: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movq_mem: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero sched: [5:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vmovq %xmm0, (%rdi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load i64, i64* %a1, align 1 + %2 = insertelement <2 x i64> zeroinitializer, i64 %1, i32 0 + %3 = add <2 x i64> %a0, %2 + %4 = extractelement <2 x i64> %3, i32 0 + store i64 %4, i64 *%a1, align 1 + ret <2 x i64> %3 +} + +define <2 x i64> @test_movq_reg(<2 x i64> %a0, <2 x i64> %a1) { +; GENERIC-LABEL: test_movq_reg: +; GENERIC: # BB#0: +; GENERIC-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movq_reg: +; ATOM: # BB#0: +; ATOM-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movq_reg: +; SLM: # BB#0: +; SLM-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movq_reg: +; SANDY: # BB#0: +; SANDY-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33] +; SANDY-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movq_reg: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.33] +; HASWELL-NEXT: vpaddq %xmm0, %xmm1, 
%xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movq_reg: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero sched: [1:0.50] +; BTVER2-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2> ; lane0 = %a0[0], lane1 = 0 (matches the checked "xmm0[0],zero") + %2 = add <2 x i64> %a1, %1 + ret <2 x i64> %2 +} + +define void @test_movsd_mem(double* %a0, double* %a1) { +; GENERIC-LABEL: test_movsd_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; GENERIC-NEXT: addsd %xmm0, %xmm0 +; GENERIC-NEXT: movsd %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movsd_mem: +; ATOM: # BB#0: +; ATOM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; ATOM-NEXT: addsd %xmm0, %xmm0 +; ATOM-NEXT: movsd %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movsd_mem: +; SLM: # BB#0: +; SLM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero sched: [3:1.00] +; SLM-NEXT: addsd %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movsd %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movsd_mem: +; SANDY: # BB#0: +; SANDY-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50] +; SANDY-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movsd_mem: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [4:0.50] +; HASWELL-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movsd_mem: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:1.00] +; BTVER2-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovsd %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load double, double* %a0, align 1 + %2 = fadd double %1, %1 
+ store double %2, double *%a1, align 1 + ret void +} + +define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) { +; GENERIC-LABEL: test_movsd_reg: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; GENERIC-NEXT: movapd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movsd_reg: +; ATOM: # BB#0: +; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; ATOM-NEXT: movapd %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movsd_reg: +; SLM: # BB#0: +; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_movsd_reg: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movsd_reg: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movsd_reg: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 2, i32 0> ; lane0 = %a1[0], lane1 = %a0[0] (matches the checked "xmm1[0],xmm0[0]") + ret <2 x double> %1 +} + +define void @test_movupd(<2 x double> *%a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_movupd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movupd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm0, %xmm0 +; GENERIC-NEXT: movupd %xmm0, (%rsi) +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_movupd: +; ATOM: # BB#0: +; ATOM-NEXT: movupd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm0, %xmm0 +; ATOM-NEXT: movupd %xmm0, (%rsi) +; ATOM-NEXT: retq +; +; SLM-LABEL: test_movupd: +; SLM: # BB#0: +; SLM-NEXT: movupd (%rdi), %xmm0 # sched: [3:1.00] +; SLM-NEXT: addpd %xmm0, %xmm0 # sched: [3:1.00] +; SLM-NEXT: movupd %xmm0, (%rsi) # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; 
SANDY-LABEL: test_movupd: +; SANDY: # BB#0: +; SANDY-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_movupd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmovupd (%rdi), %xmm0 # sched: [4:0.50] +; HASWELL-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_movupd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovupd (%rdi), %xmm0 # sched: [5:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vmovupd %xmm0, (%rsi) # sched: [1:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = load <2 x double>, <2 x double> *%a0, align 1 + %2 = fadd <2 x double> %1, %1 + store <2 x double> %2, <2 x double> *%a1, align 1 + ret void +} + +define <2 x double> @test_mulpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_mulpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: mulpd %xmm1, %xmm0 +; GENERIC-NEXT: mulpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_mulpd: +; ATOM: # BB#0: +; ATOM-NEXT: mulpd %xmm1, %xmm0 +; ATOM-NEXT: mulpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_mulpd: +; SLM: # BB#0: +; SLM-NEXT: mulpd %xmm1, %xmm0 # sched: [5:2.00] +; SLM-NEXT: mulpd (%rdi), %xmm0 # sched: [8:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mulpd: +; SANDY: # BB#0: +; SANDY-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulpd %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulpd %xmm1, 
%xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fmul <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fmul <2 x double> %1, %2 + ret <2 x double> %3 +} + +define double @test_mulsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_mulsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: mulsd %xmm1, %xmm0 +; GENERIC-NEXT: mulsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_mulsd: +; ATOM: # BB#0: +; ATOM-NEXT: mulsd %xmm1, %xmm0 +; ATOM-NEXT: mulsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_mulsd: +; SLM: # BB#0: +; SLM-NEXT: mulsd %xmm1, %xmm0 # sched: [5:2.00] +; SLM-NEXT: mulsd (%rdi), %xmm0 # sched: [8:2.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_mulsd: +; SANDY: # BB#0: +; SANDY-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_mulsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_mulsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fmul double %a0, %a1 + %2 = load double, double *%a2, align 8 + %3 = fmul double %1, %2 + ret double %3 +} + +define <2 x double> @test_orpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_orpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: orpd %xmm1, %xmm0 +; GENERIC-NEXT: orpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_orpd: +; ATOM: # BB#0: +; ATOM-NEXT: orpd %xmm1, %xmm0 +; ATOM-NEXT: orpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; 
SLM-LABEL: test_orpd: +; SLM: # BB#0: +; SLM-NEXT: orpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: orpd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_orpd: +; SANDY: # BB#0: +; SANDY-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_orpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_orpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <2 x double> %a0 to <4 x i32> + %2 = bitcast <2 x double> %a1 to <4 x i32> + %3 = or <4 x i32> %1, %2 + %4 = load <2 x double>, <2 x double> *%a2, align 16 + %5 = bitcast <2 x double> %4 to <4 x i32> + %6 = or <4 x i32> %3, %5 + %7 = bitcast <4 x i32> %6 to <2 x double> + %8 = fadd <2 x double> %a1, %7 + ret <2 x double> %8 +} + +define <8 x i16> @test_packssdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_packssdw: +; GENERIC: # BB#0: +; GENERIC-NEXT: packssdw %xmm1, %xmm0 +; GENERIC-NEXT: packssdw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_packssdw: +; ATOM: # BB#0: +; ATOM-NEXT: packssdw %xmm1, %xmm0 +; ATOM-NEXT: packssdw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_packssdw: +; SLM: # BB#0: +; SLM-NEXT: packssdw %xmm1, %xmm0 # sched: 
[1:1.00] +; SLM-NEXT: packssdw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_packssdw: +; SANDY: # BB#0: +; SANDY-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_packssdw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_packssdw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpackssdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = load <4 x i32>, <4 x i32> *%a2, align 16 + %4 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %2, <4 x i32> %3) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone + +define <16 x i8> @test_packsswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_packsswb: +; GENERIC: # BB#0: +; GENERIC-NEXT: packsswb %xmm1, %xmm0 +; GENERIC-NEXT: packsswb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_packsswb: +; ATOM: # BB#0: +; ATOM-NEXT: packsswb %xmm1, %xmm0 +; ATOM-NEXT: packsswb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_packsswb: +; SLM: # BB#0: +; SLM-NEXT: packsswb %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: packsswb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_packsswb: +; SANDY: # BB#0: +; SANDY-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # 
sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_packsswb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_packsswb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpacksswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = load <8 x i16>, <8 x i16> *%a2, align 16 + %4 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %2, <8 x i16> %3) + ret <16 x i8> %4 +} +declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_packuswb: +; GENERIC: # BB#0: +; GENERIC-NEXT: packuswb %xmm1, %xmm0 +; GENERIC-NEXT: packuswb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_packuswb: +; ATOM: # BB#0: +; ATOM-NEXT: packuswb %xmm1, %xmm0 +; ATOM-NEXT: packuswb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_packuswb: +; SLM: # BB#0: +; SLM-NEXT: packuswb %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: packuswb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_packuswb: +; SANDY: # BB#0: +; SANDY-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_packuswb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: retq 
# sched: [1:1.00] +; +; BTVER2-LABEL: test_packuswb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpackuswb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = load <8 x i16>, <8 x i16> *%a2, align 16 + %4 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %2, <8 x i16> %3) + ret <16 x i8> %4 +} +declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_paddb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_paddb: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddb %xmm1, %xmm0 +; GENERIC-NEXT: paddb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddb: +; ATOM: # BB#0: +; ATOM-NEXT: paddb %xmm1, %xmm0 +; ATOM-NEXT: paddb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddb: +; SLM: # BB#0: +; SLM-NEXT: paddb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddb: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <16 x i8> %a0, %a1 + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = add <16 x i8> %1, %2 + ret <16 x i8> %3 +} + +define <4 x i32> 
@test_paddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_paddd: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: paddd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddd: +; ATOM: # BB#0: +; ATOM-NEXT: paddd %xmm1, %xmm0 +; ATOM-NEXT: paddd (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddd: +; SLM: # BB#0: +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddd: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <4 x i32> %a0, %a1 + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = add <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <2 x i64> @test_paddq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_paddq: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: paddq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddq: +; ATOM: # BB#0: +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: paddq (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddq: +; SLM: # BB#0: +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddq: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: 
[1:0.50] +; SANDY-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = add <2 x i64> %1, %2 + ret <2 x i64> %3 +} + +define <16 x i8> @test_paddsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_paddsb: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddsb %xmm1, %xmm0 +; GENERIC-NEXT: paddsb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddsb: +; ATOM: # BB#0: +; ATOM-NEXT: paddsb %xmm1, %xmm0 +; ATOM-NEXT: paddsb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddsb: +; SLM: # BB#0: +; SLM-NEXT: paddsb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddsb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddsb: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddsb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddsb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, 
<16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_paddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_paddsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddsw %xmm1, %xmm0 +; GENERIC-NEXT: paddsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddsw: +; ATOM: # BB#0: +; ATOM-NEXT: paddsw %xmm1, %xmm0 +; ATOM-NEXT: paddsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddsw: +; SLM: # BB#0: +; SLM-NEXT: paddsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_paddusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_paddusb: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddusb %xmm1, %xmm0 +; GENERIC-NEXT: 
paddusb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddusb: +; ATOM: # BB#0: +; ATOM-NEXT: paddusb %xmm1, %xmm0 +; ATOM-NEXT: paddusb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddusb: +; SLM: # BB#0: +; SLM-NEXT: paddusb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddusb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddusb: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddusb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddusb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_paddusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_paddusw: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddusw %xmm1, %xmm0 +; GENERIC-NEXT: paddusw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddusw: +; ATOM: # BB#0: +; ATOM-NEXT: paddusw %xmm1, %xmm0 +; ATOM-NEXT: paddusw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddusw: +; SLM: # BB#0: +; SLM-NEXT: paddusw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddusw (%rdi), %xmm0 # sched: [4:1.00] 
+; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddusw: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddusw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddusw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_paddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_paddw: +; GENERIC: # BB#0: +; GENERIC-NEXT: paddw %xmm1, %xmm0 +; GENERIC-NEXT: paddw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_paddw: +; ATOM: # BB#0: +; ATOM-NEXT: paddw %xmm1, %xmm0 +; ATOM-NEXT: paddw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_paddw: +; SLM: # BB#0: +; SLM-NEXT: paddw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: paddw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_paddw: +; SANDY: # BB#0: +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_paddw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; 
HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_paddw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = add <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = add <8 x i16> %1, %2 + ret <8 x i16> %3 +} + +define <2 x i64> @test_pand(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_pand: +; GENERIC: # BB#0: +; GENERIC-NEXT: pand %xmm1, %xmm0 +; GENERIC-NEXT: pand (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pand: +; ATOM: # BB#0: +; ATOM-NEXT: pand %xmm1, %xmm0 +; ATOM-NEXT: pand (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pand: +; SLM: # BB#0: +; SLM-NEXT: pand %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pand (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pand: +; SANDY: # BB#0: +; SANDY-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pand: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pand: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpand %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpand (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = and <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = and <2 x i64> %1, %2 + %4 = add <2 x i64> %3, %a1 + ret <2 x i64> %4 +} + 
+define <2 x i64> @test_pandn(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_pandn: +; GENERIC: # BB#0: +; GENERIC-NEXT: pandn %xmm1, %xmm0 +; GENERIC-NEXT: movdqa %xmm0, %xmm1 +; GENERIC-NEXT: pandn (%rdi), %xmm1 +; GENERIC-NEXT: paddq %xmm0, %xmm1 +; GENERIC-NEXT: movdqa %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pandn: +; ATOM: # BB#0: +; ATOM-NEXT: pandn %xmm1, %xmm0 +; ATOM-NEXT: movdqa %xmm0, %xmm1 +; ATOM-NEXT: pandn (%rdi), %xmm1 +; ATOM-NEXT: paddq %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pandn: +; SLM: # BB#0: +; SLM-NEXT: pandn %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pandn (%rdi), %xmm1 # sched: [4:1.00] +; SLM-NEXT: paddq %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pandn: +; SANDY: # BB#0: +; SANDY-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pandn: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [5:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pandn: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpandn %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpandn (%rdi), %xmm0, %xmm1 # sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = xor <2 x i64> %a0, <i64 -1, i64 -1> ; bitwise NOT of %a0 (xor with all-ones) + %2 = and <2 x i64> %a1, %1 + %3 = load <2 x i64>, <2 x i64> *%a2, align 16 + %4 = xor <2 x i64> %2, <i64 -1, i64 -1> ; bitwise NOT of %2 (xor with all-ones) + %5 = and <2 x i64> %3, %4 + %6 = add <2 x i64> %2, %5 + ret <2 x i64> %6 +} + +define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x
i8> *%a2) { +; GENERIC-LABEL: test_pavgb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pavgb %xmm1, %xmm0 +; GENERIC-NEXT: pavgb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pavgb: +; ATOM: # BB#0: +; ATOM-NEXT: pavgb %xmm1, %xmm0 +; ATOM-NEXT: pavgb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pavgb: +; SLM: # BB#0: +; SLM-NEXT: pavgb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pavgb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pavgb: +; SANDY: # BB#0: +; SANDY-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pavgb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pavgb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone + +define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pavgw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pavgw %xmm1, %xmm0 +; GENERIC-NEXT: pavgw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pavgw: +; ATOM: # BB#0: +; ATOM-NEXT: pavgw %xmm1, %xmm0 +; ATOM-NEXT: pavgw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pavgw: +; SLM: # BB#0: +; SLM-NEXT: pavgw %xmm1, %xmm0 
# sched: [1:0.50] +; SLM-NEXT: pavgw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pavgw: +; SANDY: # BB#0: +; SANDY-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pavgw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pavgw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pcmpeqb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpeqb %xmm0, %xmm1 +; GENERIC-NEXT: pcmpeqb (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpeqb: +; ATOM: # BB#0: +; ATOM-NEXT: pcmpeqb %xmm0, %xmm1 +; ATOM-NEXT: pcmpeqb (%rdi), %xmm0 +; ATOM-NEXT: por %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpeqb: +; SLM: # BB#0: +; SLM-NEXT: pcmpeqb %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pcmpeqb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpeqb: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: 
retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpeqb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpeqb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpeqb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp eq <16 x i8> %a0, %a1 + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = icmp eq <16 x i8> %a0, %2 + %4 = or <16 x i1> %1, %3 + %5 = sext <16 x i1> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <4 x i32> @test_pcmpeqd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pcmpeqd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpeqd %xmm0, %xmm1 +; GENERIC-NEXT: pcmpeqd (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpeqd: +; ATOM: # BB#0: +; ATOM-NEXT: pcmpeqd %xmm0, %xmm1 +; ATOM-NEXT: pcmpeqd (%rdi), %xmm0 +; ATOM-NEXT: por %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpeqd: +; SLM: # BB#0: +; SLM-NEXT: pcmpeqd %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pcmpeqd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpeqd: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpeqd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # 
sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpeqd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpeqd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp eq <4 x i32> %a0, %a1 + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = icmp eq <4 x i32> %a0, %2 + %4 = or <4 x i1> %1, %3 + %5 = sext <4 x i1> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <8 x i16> @test_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pcmpeqw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pcmpeqw %xmm0, %xmm1 +; GENERIC-NEXT: pcmpeqw (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpeqw: +; ATOM: # BB#0: +; ATOM-NEXT: pcmpeqw %xmm0, %xmm1 +; ATOM-NEXT: pcmpeqw (%rdi), %xmm0 +; ATOM-NEXT: por %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpeqw: +; SLM: # BB#0: +; SLM-NEXT: pcmpeqw %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: pcmpeqw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpeqw: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpeqw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpeqw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpeqw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor 
%xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp eq <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = icmp eq <8 x i16> %a0, %2 + %4 = or <8 x i1> %1, %3 + %5 = sext <8 x i1> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define <16 x i8> @test_pcmpgtb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pcmpgtb: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa %xmm0, %xmm2 +; GENERIC-NEXT: pcmpgtb %xmm1, %xmm2 +; GENERIC-NEXT: pcmpgtb (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpgtb: +; ATOM: # BB#0: +; ATOM-NEXT: movdqa %xmm0, %xmm2 +; ATOM-NEXT: pcmpgtb (%rdi), %xmm0 +; ATOM-NEXT: pcmpgtb %xmm1, %xmm2 +; ATOM-NEXT: por %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpgtb: +; SLM: # BB#0: +; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: pcmpgtb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: pcmpgtb %xmm1, %xmm2 # sched: [1:0.50] +; SLM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpgtb: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpgtb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpgtb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpgtb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp sgt <16 x i8> %a0, %a1 + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = 
icmp sgt <16 x i8> %a0, %2 + %4 = or <16 x i1> %1, %3 + %5 = sext <16 x i1> %4 to <16 x i8> + ret <16 x i8> %5 +} + +define <4 x i32> @test_pcmpgtd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pcmpgtd: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa %xmm0, %xmm2 +; GENERIC-NEXT: pcmpgtd %xmm1, %xmm2 +; GENERIC-NEXT: pcmpgtd (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpgtd: +; ATOM: # BB#0: +; ATOM-NEXT: movdqa %xmm0, %xmm2 +; ATOM-NEXT: pcmpgtd (%rdi), %xmm0 +; ATOM-NEXT: pcmpgtd %xmm1, %xmm2 +; ATOM-NEXT: por %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpgtd: +; SLM: # BB#0: +; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: pcmpgtd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: pcmpgtd %xmm1, %xmm2 # sched: [1:0.50] +; SLM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpgtd: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpgtd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpgtd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpgtd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp sgt <4 x i32> %a0, %a1 + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = icmp sgt <4 x i32> %a0, %2 ; signed greater-than against the loaded value, so the folded-load compare is PCMPGTD as well + %4 = or <4 x i1> %1, %3 + %5 = sext <4 x i1> %4 to <4 x i32> + ret <4 x i32> %5 +} + +define <8 x i16> @test_pcmpgtw(<8 x i16> %a0, <8 x
i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pcmpgtw: +; GENERIC: # BB#0: +; GENERIC-NEXT: movdqa %xmm0, %xmm2 +; GENERIC-NEXT: pcmpgtw %xmm1, %xmm2 +; GENERIC-NEXT: pcmpgtw (%rdi), %xmm0 +; GENERIC-NEXT: por %xmm2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pcmpgtw: +; ATOM: # BB#0: +; ATOM-NEXT: movdqa %xmm0, %xmm2 +; ATOM-NEXT: pcmpgtw (%rdi), %xmm0 +; ATOM-NEXT: pcmpgtw %xmm1, %xmm2 +; ATOM-NEXT: por %xmm2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pcmpgtw: +; SLM: # BB#0: +; SLM-NEXT: movdqa %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: pcmpgtw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: pcmpgtw %xmm1, %xmm2 # sched: [1:0.50] +; SLM-NEXT: por %xmm2, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pcmpgtw: +; SANDY: # BB#0: +; SANDY-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; SANDY-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pcmpgtw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; HASWELL-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pcmpgtw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpcmpgtw %xmm1, %xmm0, %xmm1 # sched: [1:0.50] +; BTVER2-NEXT: vpcmpgtw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpor %xmm0, %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = icmp sgt <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = icmp sgt <8 x i16> %a0, %2 + %4 = or <8 x i1> %1, %3 + %5 = sext <8 x i1> %4 to <8 x i16> + ret <8 x i16> %5 +} + +define i16 @test_pextrw(<8 x i16> %a0) { +; GENERIC-LABEL: test_pextrw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pextrw $6, %xmm0, %eax +; GENERIC-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; GENERIC-NEXT: retq +; +; ATOM-LABEL: 
test_pextrw: +; ATOM: # BB#0: +; ATOM-NEXT: pextrw $6, %xmm0, %eax +; ATOM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pextrw: +; SLM: # BB#0: +; SLM-NEXT: pextrw $6, %xmm0, %eax # sched: [4:1.00] +; SLM-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pextrw: +; SANDY: # BB#0: +; SANDY-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] +; SANDY-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pextrw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:1.00] +; HASWELL-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pextrw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpextrw $6, %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill> +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = extractelement <8 x i16> %a0, i32 6 ; lane 6, matching the $6 immediate in the checks + ret i16 %1 +} + +define <4 x i32> @test_pmaddwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmaddwd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaddwd %xmm1, %xmm0 +; GENERIC-NEXT: pmaddwd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmaddwd: +; ATOM: # BB#0: +; ATOM-NEXT: pmaddwd %xmm1, %xmm0 +; ATOM-NEXT: pmaddwd (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmaddwd: +; SLM: # BB#0: +; SLM-NEXT: pmaddwd %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmaddwd (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaddwd: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaddwd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # 
sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaddwd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmaddwd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) + %2 = bitcast <4 x i32> %1 to <8 x i16> + %3 = load <8 x i16>, <8 x i16> *%a2, align 16 + %4 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %2, <8 x i16> %3) + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_pmaxsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmaxsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaxsw %xmm1, %xmm0 +; GENERIC-NEXT: pmaxsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmaxsw: +; ATOM: # BB#0: +; ATOM-NEXT: pmaxsw %xmm1, %xmm0 +; ATOM-NEXT: pmaxsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmaxsw: +; SLM: # BB#0: +; SLM-NEXT: pmaxsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pmaxsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaxsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaxsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaxsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpmaxsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> 
*%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_pmaxub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pmaxub: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmaxub %xmm1, %xmm0 +; GENERIC-NEXT: pmaxub (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmaxub: +; ATOM: # BB#0: +; ATOM-NEXT: pmaxub %xmm1, %xmm0 +; ATOM-NEXT: pmaxub (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmaxub: +; SLM: # BB#0: +; SLM-NEXT: pmaxub %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pmaxub (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmaxub: +; SANDY: # BB#0: +; SANDY-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmaxub: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmaxub: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpmaxub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_pminsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pminsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pminsw %xmm1, %xmm0 +; GENERIC-NEXT: pminsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; 
ATOM-LABEL: test_pminsw: +; ATOM: # BB#0: +; ATOM-NEXT: pminsw %xmm1, %xmm0 +; ATOM-NEXT: pminsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pminsw: +; SLM: # BB#0: +; SLM-NEXT: pminsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pminsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pminsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pminsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pminsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpminsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_pminub(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_pminub: +; GENERIC: # BB#0: +; GENERIC-NEXT: pminub %xmm1, %xmm0 +; GENERIC-NEXT: pminub (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pminub: +; ATOM: # BB#0: +; ATOM-NEXT: pminub %xmm1, %xmm0 +; ATOM-NEXT: pminub (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pminub: +; SLM: # BB#0: +; SLM-NEXT: pminub %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pminub (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pminub: +; SANDY: 
# BB#0: +; SANDY-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pminub: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pminub: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpminub %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpminub (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone + +define i32 @test_pmovmskb(<16 x i8> %a0) { +; GENERIC-LABEL: test_pmovmskb: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmovmskb %xmm0, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmovmskb: +; ATOM: # BB#0: +; ATOM-NEXT: pmovmskb %xmm0, %eax +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmovmskb: +; SLM: # BB#0: +; SLM-NEXT: pmovmskb %xmm0, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmovmskb: +; SANDY: # BB#0: +; SANDY-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmovmskb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmovmskb %xmm0, %eax # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmovmskb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmovmskb %xmm0, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) + ret i32 %1 +} +declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone + +define <8 x i16> @test_pmulhuw(<8 x i16> %a0, <8 x i16> %a1, <8 
x i16> *%a2) { +; GENERIC-LABEL: test_pmulhuw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmulhuw %xmm1, %xmm0 +; GENERIC-NEXT: pmulhuw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmulhuw: +; ATOM: # BB#0: +; ATOM-NEXT: pmulhuw %xmm1, %xmm0 +; ATOM-NEXT: pmulhuw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmulhuw: +; SLM: # BB#0: +; SLM-NEXT: pmulhuw %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmulhuw (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmulhuw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmulhuw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmulhuw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmulhuw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_pmulhw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmulhw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmulhw %xmm1, %xmm0 +; GENERIC-NEXT: pmulhw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmulhw: +; ATOM: # BB#0: +; ATOM-NEXT: pmulhw %xmm1, %xmm0 +; ATOM-NEXT: pmulhw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmulhw: +; SLM: # BB#0: +; SLM-NEXT: pmulhw %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmulhw (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: 
[4:1.00] +; +; SANDY-LABEL: test_pmulhw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmulhw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmulhw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmulhw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_pmullw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_pmullw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmullw %xmm1, %xmm0 +; GENERIC-NEXT: pmullw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmullw: +; ATOM: # BB#0: +; ATOM-NEXT: pmullw %xmm1, %xmm0 +; ATOM-NEXT: pmullw (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmullw: +; SLM: # BB#0: +; SLM-NEXT: pmullw %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmullw (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmullw: +; SANDY: # BB#0: +; SANDY-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmullw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmullw: +; BTVER2: # BB#0: +; 
BTVER2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmullw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = mul <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = mul <8 x i16> %1, %2 + ret <8 x i16> %3 +} + +define <2 x i64> @test_pmuludq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pmuludq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pmuludq %xmm1, %xmm0 +; GENERIC-NEXT: pmuludq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pmuludq: +; ATOM: # BB#0: +; ATOM-NEXT: pmuludq %xmm1, %xmm0 +; ATOM-NEXT: pmuludq (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pmuludq: +; SLM: # BB#0: +; SLM-NEXT: pmuludq %xmm1, %xmm0 # sched: [4:1.00] +; SLM-NEXT: pmuludq (%rdi), %xmm0 # sched: [7:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pmuludq: +; SANDY: # BB#0: +; SANDY-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pmuludq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pmuludq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpmuludq (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = load <4 x i32>, <4 x i32> *%a2, align 16 + %4 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %2, <4 x i32> %3) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone + 
+define <2 x i64> @test_por(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_por: +; GENERIC: # BB#0: +; GENERIC-NEXT: por %xmm1, %xmm0 +; GENERIC-NEXT: por (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_por: +; ATOM: # BB#0: +; ATOM-NEXT: por %xmm1, %xmm0 +; ATOM-NEXT: por (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_por: +; SLM: # BB#0: +; SLM-NEXT: por %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: por (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_por: +; SANDY: # BB#0: +; SANDY-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_por: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_por: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = or <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = or <2 x i64> %1, %2 + %4 = add <2 x i64> %3, %a1 + ret <2 x i64> %4 +} + +define <2 x i64> @test_psadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_psadbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psadbw %xmm1, %xmm0 +; GENERIC-NEXT: psadbw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psadbw: +; ATOM: # BB#0: +; ATOM-NEXT: psadbw %xmm1, %xmm0 +; ATOM-NEXT: psadbw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; 
ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psadbw: +; SLM: # BB#0: +; SLM-NEXT: psadbw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psadbw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psadbw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; SANDY-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psadbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psadbw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; BTVER2-NEXT: vpsadbw (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) + %2 = bitcast <2 x i64> %1 to <16 x i8> + %3 = load <16 x i8>, <16 x i8> *%a2, align 16 + %4 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %2, <16 x i8> %3) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone + +define <4 x i32> @test_pshufd(<4 x i32> %a0, <4 x i32> *%a1) { +; GENERIC-LABEL: test_pshufd: +; GENERIC: # BB#0: +; GENERIC-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] +; GENERIC-NEXT: pshufd {{.*#+}} xmm0 = mem[3,2,1,0] +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pshufd: +; ATOM: # BB#0: +; ATOM-NEXT: pshufd {{.*#+}} xmm1 = mem[3,2,1,0] +; ATOM-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] +; ATOM-NEXT: paddd %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pshufd: +; SLM: # BB#0: +; SLM-NEXT: pshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [4:1.00] +; SLM-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00] +; SLM-NEXT: paddd %xmm0, 
%xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pshufd: +; SANDY: # BB#0: +; SANDY-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] +; SANDY-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pshufd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:1.00] +; HASWELL-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pshufd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpshufd {{.*#+}} xmm1 = mem[3,2,1,0] sched: [6:1.00] +; BTVER2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,3,2] sched: [1:0.50] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2> + %2 = load <4 x i32>, <4 x i32> *%a1, align 16 + %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0> + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define <8 x i16> @test_pshufhw(<8 x i16> %a0, <8 x i16> *%a1) { +; GENERIC-LABEL: test_pshufhw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,4,7,6] +; GENERIC-NEXT: pshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,6,5,4] +; GENERIC-NEXT: paddw %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pshufhw: +; ATOM: # BB#0: +; ATOM-NEXT: pshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] +; ATOM-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] +; ATOM-NEXT: paddw %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pshufhw: +; SLM: # BB#0: +; SLM-NEXT: pshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [4:1.00] +; SLM-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00] +; SLM-NEXT: paddw %xmm0, %xmm1 # sched: 
[1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pshufhw: +; SANDY: # BB#0: +; SANDY-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] +; SANDY-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:0.50] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pshufhw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:1.00] +; HASWELL-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [5:1.00] +; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pshufhw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpshufhw {{.*#+}} xmm1 = mem[0,1,2,3,7,6,5,4] sched: [6:1.00] +; BTVER2-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6] sched: [1:0.50] +; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 5, i32 4, i32 7, i32 6> + %2 = load <8 x i16>, <8 x i16> *%a1, align 16 + %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4> + %4 = add <8 x i16> %1, %3 + ret <8 x i16> %4 +} + +define <8 x i16> @test_pshuflw(<8 x i16> %a0, <8 x i16> *%a1) { +; GENERIC-LABEL: test_pshuflw: +; GENERIC: # BB#0: +; GENERIC-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7] +; GENERIC-NEXT: pshuflw {{.*#+}} xmm0 = mem[3,2,1,0,4,5,6,7] +; GENERIC-NEXT: paddw %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pshuflw: +; ATOM: # BB#0: +; ATOM-NEXT: pshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] +; ATOM-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; ATOM-NEXT: paddw %xmm0, %xmm1 +; ATOM-NEXT: movdqa %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pshuflw: +; SLM: # BB#0: +; SLM-NEXT: pshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [4:1.00] +; SLM-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: 
[1:1.00] +; SLM-NEXT: paddw %xmm0, %xmm1 # sched: [1:0.50] +; SLM-NEXT: movdqa %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pshuflw: +; SANDY: # BB#0: +; SANDY-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] +; SANDY-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:0.50] +; SANDY-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pshuflw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:1.00] +; HASWELL-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [5:1.00] +; HASWELL-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pshuflw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpshuflw {{.*#+}} xmm1 = mem[3,2,1,0,4,5,6,7] sched: [6:1.00] +; BTVER2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] sched: [1:0.50] +; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 4, i32 5, i32 6, i32 7> + %2 = load <8 x i16>, <8 x i16> *%a1, align 16 + %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7> + %4 = add <8 x i16> %1, %3 + ret <8 x i16> %4 +} + +define <4 x i32> @test_pslld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_pslld: +; GENERIC: # BB#0: +; GENERIC-NEXT: pslld %xmm1, %xmm0 +; GENERIC-NEXT: pslld (%rdi), %xmm0 +; GENERIC-NEXT: pslld $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pslld: +; ATOM: # BB#0: +; ATOM-NEXT: pslld %xmm1, %xmm0 +; ATOM-NEXT: pslld (%rdi), %xmm0 +; ATOM-NEXT: pslld $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pslld: +; SLM: # BB#0: +; SLM-NEXT: pslld %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: pslld (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: pslld $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pslld: +; SANDY: # BB#0: 
+; SANDY-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pslld: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pslld: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpslld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpslld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpslld $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %1, <4 x i32> %2) + %4 = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %3, i32 2) + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone + +define <4 x i32> @test_pslldq(<4 x i32> %a0) { +; GENERIC-LABEL: test_pslldq: +; GENERIC: # BB#0: +; GENERIC-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pslldq: +; ATOM: # BB#0: +; ATOM-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pslldq: +; SLM: # BB#0: +; SLM-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pslldq: +; SANDY: # BB#0: +; SANDY-NEXT: vpslldq {{.*#+}} xmm0 = 
zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pslldq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pslldq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11] sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 4, i32 0, i32 1, i32 2> + ret <4 x i32> %1 +} + +define <2 x i64> @test_psllq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_psllq: +; GENERIC: # BB#0: +; GENERIC-NEXT: psllq %xmm1, %xmm0 +; GENERIC-NEXT: psllq (%rdi), %xmm0 +; GENERIC-NEXT: psllq $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psllq: +; ATOM: # BB#0: +; ATOM-NEXT: psllq %xmm1, %xmm0 +; ATOM-NEXT: psllq (%rdi), %xmm0 +; ATOM-NEXT: psllq $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psllq: +; SLM: # BB#0: +; SLM-NEXT: psllq %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psllq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psllq $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psllq: +; SANDY: # BB#0: +; SANDY-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psllq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsllq $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psllq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsllq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsllq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsllq $2, 
%xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %1, <2 x i64> %2) + %4 = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %3, i32 2) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone +declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone + +define <8 x i16> @test_psllw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psllw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psllw %xmm1, %xmm0 +; GENERIC-NEXT: psllw (%rdi), %xmm0 +; GENERIC-NEXT: psllw $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psllw: +; ATOM: # BB#0: +; ATOM-NEXT: psllw %xmm1, %xmm0 +; ATOM-NEXT: psllw (%rdi), %xmm0 +; ATOM-NEXT: psllw $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psllw: +; SLM: # BB#0: +; SLM-NEXT: psllw %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psllw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psllw $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psllw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psllw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psllw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsllw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsllw $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> 
@llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %1, <8 x i16> %2) + %4 = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %3, i32 2) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone + +define <4 x i32> @test_psrad(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_psrad: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrad %xmm1, %xmm0 +; GENERIC-NEXT: psrad (%rdi), %xmm0 +; GENERIC-NEXT: psrad $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrad: +; ATOM: # BB#0: +; ATOM-NEXT: psrad %xmm1, %xmm0 +; ATOM-NEXT: psrad (%rdi), %xmm0 +; ATOM-NEXT: psrad $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrad: +; SLM: # BB#0: +; SLM-NEXT: psrad %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psrad (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psrad $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrad: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrad: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrad: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrad %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsrad (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsrad $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 
16 + %3 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> %2) + %4 = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %3, i32 2) + ret <4 x i32> %4 +} +declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone + +define <8 x i16> @test_psraw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psraw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psraw %xmm1, %xmm0 +; GENERIC-NEXT: psraw (%rdi), %xmm0 +; GENERIC-NEXT: psraw $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psraw: +; ATOM: # BB#0: +; ATOM-NEXT: psraw %xmm1, %xmm0 +; ATOM-NEXT: psraw (%rdi), %xmm0 +; ATOM-NEXT: psraw $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psraw: +; SLM: # BB#0: +; SLM-NEXT: psraw %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psraw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psraw $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psraw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psraw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psraw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsraw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsraw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsraw $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> %2) + %4 = call <8 x i16> 
@llvm.x86.sse2.psrai.w(<8 x i16> %3, i32 2) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone + +define <4 x i32> @test_psrld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_psrld: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrld %xmm1, %xmm0 +; GENERIC-NEXT: psrld (%rdi), %xmm0 +; GENERIC-NEXT: psrld $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrld: +; ATOM: # BB#0: +; ATOM-NEXT: psrld %xmm1, %xmm0 +; ATOM-NEXT: psrld (%rdi), %xmm0 +; ATOM-NEXT: psrld $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrld: +; SLM: # BB#0: +; SLM-NEXT: psrld %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psrld (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psrld $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrld: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrld: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrld: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrld %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsrld (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsrld $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %1, <4 x i32> %2) + %4 = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %3, i32 2) + ret <4 x i32> %4 +} +declare <4 x i32> 
@llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone +declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone + +define <4 x i32> @test_psrldq(<4 x i32> %a0) { +; GENERIC-LABEL: test_psrldq: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrldq: +; ATOM: # BB#0: +; ATOM-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrldq: +; SLM: # BB#0: +; SLM-NEXT: psrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrldq: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrldq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrldq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 1, i32 2, i32 3, i32 4> + ret <4 x i32> %1 +} + +define <2 x i64> @test_psrlq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_psrlq: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrlq %xmm1, %xmm0 +; GENERIC-NEXT: psrlq (%rdi), %xmm0 +; GENERIC-NEXT: psrlq $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrlq: +; ATOM: # BB#0: +; ATOM-NEXT: psrlq %xmm1, %xmm0 +; ATOM-NEXT: psrlq (%rdi), %xmm0 +; ATOM-NEXT: psrlq $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrlq: +; SLM: # BB#0: +; SLM-NEXT: 
psrlq %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psrlq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: psrlq $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrlq: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrlq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrlq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsrlq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsrlq $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %1, <2 x i64> %2) + %4 = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %3, i32 2) + ret <2 x i64> %4 +} +declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone +declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone + +define <8 x i16> @test_psrlw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psrlw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psrlw %xmm1, %xmm0 +; GENERIC-NEXT: psrlw (%rdi), %xmm0 +; GENERIC-NEXT: psrlw $2, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psrlw: +; ATOM: # BB#0: +; ATOM-NEXT: psrlw %xmm1, %xmm0 +; ATOM-NEXT: psrlw (%rdi), %xmm0 +; ATOM-NEXT: psrlw $2, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psrlw: +; SLM: # BB#0: +; SLM-NEXT: psrlw %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: psrlw (%rdi), %xmm0 # sched: [4:1.00] +; 
SLM-NEXT: psrlw $2, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psrlw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psrlw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [2:1.00] +; HASWELL-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psrlw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsrlw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpsrlw $2, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %1, <8 x i16> %2) + %4 = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %3, i32 2) + ret <8 x i16> %4 +} +declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone + +define <16 x i8> @test_psubb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_psubb: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubb %xmm1, %xmm0 +; GENERIC-NEXT: psubb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubb: +; ATOM: # BB#0: +; ATOM-NEXT: psubb %xmm1, %xmm0 +; ATOM-NEXT: psubb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubb: +; SLM: # BB#0: +; SLM-NEXT: psubb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubb: +; SANDY: # BB#0: +; 
SANDY-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sub <16 x i8> %a0, %a1 + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = sub <16 x i8> %1, %2 + ret <16 x i8> %3 +} + +define <4 x i32> @test_psubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_psubd: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubd %xmm1, %xmm0 +; GENERIC-NEXT: psubd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubd: +; ATOM: # BB#0: +; ATOM-NEXT: psubd %xmm1, %xmm0 +; ATOM-NEXT: psubd (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubd: +; SLM: # BB#0: +; SLM-NEXT: psubd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubd: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sub <4 x i32> %a0, %a1 + 
%2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = sub <4 x i32> %1, %2 + ret <4 x i32> %3 +} + +define <2 x i64> @test_psubq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_psubq: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubq %xmm1, %xmm0 +; GENERIC-NEXT: psubq (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubq: +; ATOM: # BB#0: +; ATOM-NEXT: psubq %xmm1, %xmm0 +; ATOM-NEXT: psubq (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubq: +; SLM: # BB#0: +; SLM-NEXT: psubq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubq (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubq: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubq (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sub <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = sub <2 x i64> %1, %2 + ret <2 x i64> %3 +} + +define <16 x i8> @test_psubsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_psubsb: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubsb %xmm1, %xmm0 +; GENERIC-NEXT: psubsb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubsb: +; ATOM: # BB#0: +; ATOM-NEXT: psubsb %xmm1, %xmm0 +; ATOM-NEXT: psubsb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubsb: +; SLM: # BB#0: +; SLM-NEXT: psubsb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubsb (%rdi), %xmm0 # sched: [4:1.00] +; 
SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubsb: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubsb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubsb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_psubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psubsw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubsw %xmm1, %xmm0 +; GENERIC-NEXT: psubsw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubsw: +; ATOM: # BB#0: +; ATOM-NEXT: psubsw %xmm1, %xmm0 +; ATOM-NEXT: psubsw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubsw: +; SLM: # BB#0: +; SLM-NEXT: psubsw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubsw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubsw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubsw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; 
HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubsw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <16 x i8> @test_psubusb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_psubusb: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubusb %xmm1, %xmm0 +; GENERIC-NEXT: psubusb (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubusb: +; ATOM: # BB#0: +; ATOM-NEXT: psubusb %xmm1, %xmm0 +; ATOM-NEXT: psubusb (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubusb: +; SLM: # BB#0: +; SLM-NEXT: psubusb %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubusb (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubusb: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubusb: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubusb: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubusb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubusb (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = call <16 x i8> 
@llvm.x86.sse2.psubus.b(<16 x i8> %1, <16 x i8> %2) + ret <16 x i8> %3 +} +declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @test_psubusw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psubusw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubusw %xmm1, %xmm0 +; GENERIC-NEXT: psubusw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_psubusw: +; ATOM: # BB#0: +; ATOM-NEXT: psubusw %xmm1, %xmm0 +; ATOM-NEXT: psubusw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubusw: +; SLM: # BB#0: +; SLM-NEXT: psubusw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubusw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubusw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubusw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubusw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubusw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %1, <8 x i16> %2) + ret <8 x i16> %3 +} +declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @test_psubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_psubw: +; GENERIC: # BB#0: +; GENERIC-NEXT: psubw %xmm1, %xmm0 +; GENERIC-NEXT: psubw (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: 
test_psubw: +; ATOM: # BB#0: +; ATOM-NEXT: psubw %xmm1, %xmm0 +; ATOM-NEXT: psubw (%rdi), %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_psubw: +; SLM: # BB#0: +; SLM-NEXT: psubw %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: psubw (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_psubw: +; SANDY: # BB#0: +; SANDY-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_psubw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_psubw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = sub <8 x i16> %a0, %a1 + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = sub <8 x i16> %1, %2 + ret <8 x i16> %3 +} + +define <16 x i8> @test_punpckhbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_punpckhbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; GENERIC-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckhbw: +; ATOM: # BB#0: +; ATOM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; ATOM-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckhbw: +; SLM: # BB#0: +; SLM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00] +; SLM-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckhbw: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] +; SANDY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckhbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:1.00] +; HASWELL-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpckhbw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] sched: [1:0.50] +; BTVER2-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm0[8],mem[8],xmm0[9],mem[9],xmm0[10],mem[10],xmm0[11],mem[11],xmm0[12],mem[12],xmm0[13],mem[13],xmm0[14],mem[14],xmm0[15],mem[15] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i8> %3 +} + +define <4 x i32> @test_punpckhdq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_punpckhdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckhdq: +; ATOM: # BB#0: +; ATOM-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; ATOM-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] +; ATOM-NEXT: paddd %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckhdq: +; SLM: # BB#0: +; SLM-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SLM-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [4:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckhdq: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; SANDY-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckhdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; HASWELL-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; 
BTVER2-LABEL: test_punpckhdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; BTVER2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm1[2],mem[2],xmm1[3],mem[3] sched: [6:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7> + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define <2 x i64> @test_punpckhqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_punpckhqdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; GENERIC-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckhqdq: +; ATOM: # BB#0: +; ATOM-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; ATOM-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckhqdq: +; SLM: # BB#0: +; SLM-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; SLM-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckhqdq: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; SANDY-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckhqdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; HASWELL-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: 
test_punpckhqdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; BTVER2-NEXT: vpunpckhqdq {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 1, i32 3> + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> <i32 1, i32 3> + %4 = add <2 x i64> %1, %3 + ret <2 x i64> %4 +} + +define <8 x i16> @test_punpckhwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_punpckhwd: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; GENERIC-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckhwd: +; ATOM: # BB#0: +; ATOM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; ATOM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckhwd: +; SLM: # BB#0: +; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; SLM-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckhwd: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] +; SANDY-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckhwd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; HASWELL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpckhwd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] +; BTVER2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + %2 = load <8 x i16>, <8 x i16> *%a2, align 16 + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + ret <8 x i16> %3 +} + +define <16 x i8> @test_punpcklbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) { +; GENERIC-LABEL: test_punpcklbw: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; GENERIC-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpcklbw: +; ATOM: # BB#0: +; ATOM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; ATOM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpcklbw: +; SLM: # BB#0: +; SLM-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; SLM-NEXT: punpcklbw {{.*#+}} xmm0 
= xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpcklbw: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] +; SANDY-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpcklbw: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:1.00] +; HASWELL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpcklbw: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] sched: [1:0.50] +; BTVER2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> + %2 = load <16 x i8>, <16 x i8> *%a2, align 16 + %3 = shufflevector <16 x i8> %1, <16 x i8> %2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23> + ret <16 x i8> %3 +} + +define <4 x i32> @test_punpckldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) { +; GENERIC-LABEL: test_punpckldq: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 
= xmm1[0],mem[0],xmm1[1],mem[1] +; GENERIC-NEXT: paddd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpckldq: +; ATOM: # BB#0: +; ATOM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] +; ATOM-NEXT: paddd %xmm1, %xmm0 +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpckldq: +; SLM: # BB#0: +; SLM-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SLM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [4:1.00] +; SLM-NEXT: paddd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpckldq: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; SANDY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:0.50] +; SANDY-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpckldq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; HASWELL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpckldq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50] +; BTVER2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] sched: [6:1.00] +; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %2 = load <4 x i32>, <4 x i32> *%a2, align 16 + %3 = shufflevector <4 x i32> %a1, <4 x i32> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5> + %4 = add <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define <2 x i64> @test_punpcklqdq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; 
GENERIC-LABEL: test_punpcklqdq: +; GENERIC: # BB#0: +; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; GENERIC-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpcklqdq: +; ATOM: # BB#0: +; ATOM-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ATOM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpcklqdq: +; SLM: # BB#0: +; SLM-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SLM-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpcklqdq: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; SANDY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpcklqdq: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; HASWELL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpcklqdq: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; BTVER2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <2 x i32> <i32 0, i32 2> + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = shufflevector <2 x i64> %a1, <2 x i64> %2, <2x i32> <i32 0, i32 2> + %4 = add <2 x i64> %1, %3 + ret <2 x i64> %4 +} + +define <8 x i16> @test_punpcklwd(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) { +; GENERIC-LABEL: test_punpcklwd: +; GENERIC: # 
BB#0: +; GENERIC-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_punpcklwd: +; ATOM: # BB#0: +; ATOM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; ATOM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: nop +; ATOM-NEXT: retq +; +; SLM-LABEL: test_punpcklwd: +; SLM: # BB#0: +; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SLM-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [4:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_punpcklwd: +; SANDY: # BB#0: +; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; SANDY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_punpcklwd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; HASWELL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [5:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_punpcklwd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50] +; BTVER2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %2 = load <8 x i16>, <8 x 
i16> *%a2, align 16 + %3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + ret <8 x i16> %3 +} + +define <2 x i64> @test_pxor(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) { +; GENERIC-LABEL: test_pxor: +; GENERIC: # BB#0: +; GENERIC-NEXT: pxor %xmm1, %xmm0 +; GENERIC-NEXT: pxor (%rdi), %xmm0 +; GENERIC-NEXT: paddq %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_pxor: +; ATOM: # BB#0: +; ATOM-NEXT: pxor %xmm1, %xmm0 +; ATOM-NEXT: pxor (%rdi), %xmm0 +; ATOM-NEXT: paddq %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_pxor: +; SLM: # BB#0: +; SLM-NEXT: pxor %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: pxor (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: paddq %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_pxor: +; SANDY: # BB#0: +; SANDY-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_pxor: +; HASWELL: # BB#0: +; HASWELL-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; HASWELL-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; HASWELL-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_pxor: +; BTVER2: # BB#0: +; BTVER2-NEXT: vpxor %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vpxor (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = xor <2 x i64> %a0, %a1 + %2 = load <2 x i64>, <2 x i64> *%a2, align 16 + %3 = xor <2 x i64> %1, %2 + %4 = add <2 x i64> %3, %a1 + ret <2 x i64> %4 +} + +define <2 x double> @test_shufpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_shufpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; GENERIC-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] +; GENERIC-NEXT: addpd 
%xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_shufpd: +; ATOM: # BB#0: +; ATOM-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] +; ATOM-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_shufpd: +; SLM: # BB#0: +; SLM-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] +; SLM-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_shufpd: +; SANDY: # BB#0: +; SANDY-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] +; SANDY-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_shufpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:1.00] +; HASWELL-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_shufpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0] sched: [1:0.50] +; BTVER2-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 2> + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> <i32 1, i32 2> + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_sqrtpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: sqrtpd %xmm0, %xmm1 +; GENERIC-NEXT: sqrtpd (%rdi), %xmm0 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_sqrtpd: +; ATOM: # BB#0: +; ATOM-NEXT: sqrtpd %xmm0, %xmm1 +; ATOM-NEXT: sqrtpd 
(%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_sqrtpd: +; SLM: # BB#0: +; SLM-NEXT: sqrtpd (%rdi), %xmm1 # sched: [18:1.00] +; SLM-NEXT: sqrtpd %xmm0, %xmm0 # sched: [15:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_sqrtpd: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00] +; SANDY-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [15:1.00] +; HASWELL-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsqrtpd (%rdi), %xmm1 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtpd %xmm0, %xmm0 # sched: [21:21.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %2) + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} +declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone + +; TODO - sqrtsd_m + +define <2 x double> @test_sqrtsd(<2 x double> %a0, <2 x double> *%a1) { +; GENERIC-LABEL: test_sqrtsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: sqrtsd %xmm0, %xmm0 +; GENERIC-NEXT: movapd (%rdi), %xmm1 +; GENERIC-NEXT: sqrtsd %xmm1, %xmm1 +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_sqrtsd: +; ATOM: # BB#0: +; ATOM-NEXT: movapd (%rdi), %xmm1 +; ATOM-NEXT: sqrtsd %xmm0, %xmm0 +; ATOM-NEXT: sqrtsd %xmm1, %xmm1 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; 
SLM-LABEL: test_sqrtsd: +; SLM: # BB#0: +; SLM-NEXT: movapd (%rdi), %xmm1 # sched: [3:1.00] +; SLM-NEXT: sqrtsd %xmm0, %xmm0 # sched: [18:1.00] +; SLM-NEXT: sqrtsd %xmm1, %xmm1 # sched: [18:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_sqrtsd: +; SANDY: # BB#0: +; SANDY-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00] +; SANDY-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50] +; SANDY-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_sqrtsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [19:1.00] +; HASWELL-NEXT: vmovapd (%rdi), %xmm1 # sched: [4:0.50] +; HASWELL-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [19:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_sqrtsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vmovapd (%rdi), %xmm1 # sched: [5:1.00] +; BTVER2-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [26:21.00] +; BTVER2-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 # sched: [26:21.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) + %2 = load <2 x double>, <2 x double> *%a1, align 16 + %3 = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %2) + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} +declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone + +define <2 x double> @test_subpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_subpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: subpd %xmm1, %xmm0 +; GENERIC-NEXT: subpd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_subpd: +; ATOM: # BB#0: +; ATOM-NEXT: subpd %xmm1, %xmm0 +; ATOM-NEXT: subpd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: 
test_subpd: +; SLM: # BB#0: +; SLM-NEXT: subpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: subpd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_subpd: +; SANDY: # BB#0: +; SANDY-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fsub <2 x double> %a0, %a1 + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = fsub <2 x double> %1, %2 + ret <2 x double> %3 +} + +define double @test_subsd(double %a0, double %a1, double *%a2) { +; GENERIC-LABEL: test_subsd: +; GENERIC: # BB#0: +; GENERIC-NEXT: subsd %xmm1, %xmm0 +; GENERIC-NEXT: subsd (%rdi), %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_subsd: +; ATOM: # BB#0: +; ATOM-NEXT: subsd %xmm1, %xmm0 +; ATOM-NEXT: subsd (%rdi), %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_subsd: +; SLM: # BB#0: +; SLM-NEXT: subsd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: subsd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_subsd: +; SANDY: # BB#0: +; SANDY-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_subsd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_subsd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # 
sched: [3:1.00] +; BTVER2-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = fsub double %a0, %a1 + %2 = load double, double *%a2, align 8 + %3 = fsub double %1, %2 + ret double %3 +} + +define i32 @test_ucomisd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_ucomisd: +; GENERIC: # BB#0: +; GENERIC-NEXT: ucomisd %xmm1, %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %cl +; GENERIC-NEXT: andb %al, %cl +; GENERIC-NEXT: ucomisd (%rdi), %xmm0 +; GENERIC-NEXT: setnp %al +; GENERIC-NEXT: sete %dl +; GENERIC-NEXT: andb %al, %dl +; GENERIC-NEXT: orb %cl, %dl +; GENERIC-NEXT: movzbl %dl, %eax +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_ucomisd: +; ATOM: # BB#0: +; ATOM-NEXT: ucomisd %xmm1, %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %cl +; ATOM-NEXT: andb %al, %cl +; ATOM-NEXT: ucomisd (%rdi), %xmm0 +; ATOM-NEXT: setnp %al +; ATOM-NEXT: sete %dl +; ATOM-NEXT: andb %al, %dl +; ATOM-NEXT: orb %cl, %dl +; ATOM-NEXT: movzbl %dl, %eax +; ATOM-NEXT: retq +; +; SLM-LABEL: test_ucomisd: +; SLM: # BB#0: +; SLM-NEXT: ucomisd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %cl # sched: [1:0.50] +; SLM-NEXT: andb %al, %cl # sched: [1:0.50] +; SLM-NEXT: ucomisd (%rdi), %xmm0 # sched: [6:1.00] +; SLM-NEXT: setnp %al # sched: [1:0.50] +; SLM-NEXT: sete %dl # sched: [1:0.50] +; SLM-NEXT: andb %al, %dl # sched: [1:0.50] +; SLM-NEXT: orb %cl, %dl # sched: [1:0.50] +; SLM-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_ucomisd: +; SANDY: # BB#0: +; SANDY-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %cl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %cl # sched: [1:0.33] +; SANDY-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00] +; SANDY-NEXT: setnp %al # sched: [1:0.33] +; SANDY-NEXT: sete %dl # sched: [1:0.33] +; SANDY-NEXT: andb %al, %dl # sched: 
[1:0.33] +; SANDY-NEXT: orb %cl, %dl # sched: [1:0.33] +; SANDY-NEXT: movzbl %dl, %eax # sched: [1:0.33] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_ucomisd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %cl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %cl # sched: [1:0.25] +; HASWELL-NEXT: vucomisd (%rdi), %xmm0 # sched: [7:1.00] +; HASWELL-NEXT: setnp %al # sched: [1:0.50] +; HASWELL-NEXT: sete %dl # sched: [1:0.50] +; HASWELL-NEXT: andb %al, %dl # sched: [1:0.25] +; HASWELL-NEXT: orb %cl, %dl # sched: [1:0.25] +; HASWELL-NEXT: movzbl %dl, %eax # sched: [1:0.25] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_ucomisd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vucomisd %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %cl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50] +; BTVER2-NEXT: vucomisd (%rdi), %xmm0 # sched: [8:1.00] +; BTVER2-NEXT: setnp %al # sched: [1:0.50] +; BTVER2-NEXT: sete %dl # sched: [1:0.50] +; BTVER2-NEXT: andb %al, %dl # sched: [1:0.50] +; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50] +; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) + %2 = load <2 x double>, <2 x double> *%a2, align 8 + %3 = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %2) + %4 = or i32 %1, %3 + ret i32 %4 +} +declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone + +define <2 x double> @test_unpckhpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_unpckhpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; GENERIC-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_unpckhpd: +; ATOM: # 
BB#0: +; ATOM-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; ATOM-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_unpckhpd: +; SLM: # BB#0: +; SLM-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; SLM-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_unpckhpd: +; SANDY: # BB#0: +; SANDY-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; SANDY-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpckhpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; HASWELL-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpckhpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:0.50] +; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],mem[1] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 1, i32 3> + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = shufflevector <2 x double> %a1, <2 x double> %2, <2 x i32> <i32 1, i32 3> + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_unpcklpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; GENERIC-NEXT: movapd %xmm0, %xmm1 +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; GENERIC-NEXT: addpd %xmm0, %xmm1 +; GENERIC-NEXT: movapd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: 
test_unpcklpd: +; ATOM: # BB#0: +; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; ATOM-NEXT: movapd %xmm0, %xmm1 +; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] +; ATOM-NEXT: addpd %xmm0, %xmm1 +; ATOM-NEXT: movapd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_unpcklpd: +; SLM: # BB#0: +; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00] +; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] +; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] +; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_unpcklpd: +; SANDY: # BB#0: +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00] +; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_unpcklpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_unpcklpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 2> + %2 = load <2 x double>, <2 x double> *%a2, align 16 + %3 = shufflevector <2 x double> %1, <2 x double> %2, <2 x i32> <i32 0, i32 2> + %4 = fadd <2 x double> %1, %3 + ret <2 x double> %4 +} + +define <2 x double> @test_xorpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { +; GENERIC-LABEL: test_xorpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: xorpd %xmm1, %xmm0 +; GENERIC-NEXT: xorpd (%rdi), %xmm0 +; 
GENERIC-NEXT: addpd %xmm1, %xmm0 +; GENERIC-NEXT: retq +; +; ATOM-LABEL: test_xorpd: +; ATOM: # BB#0: +; ATOM-NEXT: xorpd %xmm1, %xmm0 +; ATOM-NEXT: xorpd (%rdi), %xmm0 +; ATOM-NEXT: addpd %xmm1, %xmm0 +; ATOM-NEXT: retq +; +; SLM-LABEL: test_xorpd: +; SLM: # BB#0: +; SLM-NEXT: xorpd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: xorpd (%rdi), %xmm0 # sched: [4:1.00] +; SLM-NEXT: addpd %xmm1, %xmm0 # sched: [3:1.00] +; SLM-NEXT: retq # sched: [4:1.00] +; +; SANDY-LABEL: test_xorpd: +; SANDY: # BB#0: +; SANDY-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.33] +; SANDY-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:0.50] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_xorpd: +; HASWELL: # BB#0: +; HASWELL-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:1.00] +; HASWELL-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [5:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_xorpd: +; BTVER2: # BB#0: +; BTVER2-NEXT: vxorpd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; BTVER2-NEXT: vxorpd (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: retq # sched: [4:1.00] + %1 = bitcast <2 x double> %a0 to <4 x i32> + %2 = bitcast <2 x double> %a1 to <4 x i32> + %3 = xor <4 x i32> %1, %2 + %4 = load <2 x double>, <2 x double> *%a2, align 16 + %5 = bitcast <2 x double> %4 to <4 x i32> + %6 = xor <4 x i32> %3, %5 + %7 = bitcast <4 x i32> %6 to <2 x double> + %8 = fadd <2 x double> %a1, %7 + ret <2 x double> %8 +} + +!0 = !{i32 1} -- 2.40.0