From 3e6b0a643800c67d4e9bad62f50f22689bc392e3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 30 Oct 2017 18:37:27 +0000 Subject: [PATCH] [X86][AVX512] Cleanup scheduler tests - split GENERIC and SKX targets git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@316938 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx512-schedule.ll | 9035 ++++++---- test/CodeGen/X86/avx512-shuffle-schedule.ll | 16598 ++++++++++++------ 2 files changed, 16710 insertions(+), 8923 deletions(-) diff --git a/test/CodeGen/X86/avx512-schedule.ll b/test/CodeGen/X86/avx512-schedule.ll index 7a47d20186f..2a0736f51fc 100755 --- a/test/CodeGen/X86/avx512-schedule.ll +++ b/test/CodeGen/X86/avx512-schedule.ll @@ -1,62 +1,94 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX + ; This test is an assembly of avx512 instructions to check their scheduling define <8 x double> @addpd512(<8 x double> %y, <8 x double> %x) { -; CHECK-LABEL: addpd512: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: addpd512: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: addpd512: +; SKX: # BB#0: # %entry +; SKX-NEXT: vaddpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %add.i = fadd <8 x double> %x, %y ret <8 x double> %add.i } define <8 x double> @addpd512fold(<8 x double> %y) { -; CHECK-LABEL: addpd512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: addpd512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: addpd512fold: +; SKX: # BB#0: # %entry +; SKX-NEXT: vaddpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %add.i = fadd <8 x double> %y, ret <8 x double> %add.i } define <16 x float> @addps512(<16 x float> %y, <16 x float> %x) { -; CHECK-LABEL: addps512: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: addps512: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: addps512: +; SKX: # BB#0: # %entry +; SKX-NEXT: vaddps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %add.i = fadd <16 x float> %x, %y ret <16 x float> %add.i } define <16 x float> @addps512fold(<16 x float> %y) { -; CHECK-LABEL: addps512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: addps512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: addps512fold: +; SKX: # BB#0: # %entry +; SKX-NEXT: vaddps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %add.i = fadd <16 x float> %y, ret <16 x float> %add.i } define <8 x double> @subpd512(<8 x double> %y, <8 x double> %x) { -; CHECK-LABEL: subpd512: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsubpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: subpd512: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vsubpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: subpd512: +; SKX: # BB#0: # %entry +; SKX-NEXT: vsubpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %sub.i = fsub <8 x double> %x, %y ret <8 x double> %sub.i } define <8 x double> @subpd512fold(<8 x double> %y, <8 x double>* %x) { -; CHECK-LABEL: subpd512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsubpd (%rdi), %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: subpd512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vsubpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: subpd512fold: +; SKX: # BB#0: # %entry +; SKX-NEXT: vsubpd (%rdi), %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp2 = load <8 x double>, <8 x double>* %x, align 8 %sub.i = fsub <8 x double> %y, %tmp2 @@ -64,20 +96,30 @@ entry: } define <16 x float> @subps512(<16 x float> %y, <16 x float> %x) { -; CHECK-LABEL: subps512: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsubps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: subps512: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vsubps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: subps512: +; SKX: # BB#0: # %entry +; SKX-NEXT: vsubps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %sub.i = fsub <16 x float> %x, %y ret <16 x float> %sub.i } define <16 x float> @subps512fold(<16 x float> %y, <16 x float>* %x) { -; CHECK-LABEL: subps512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsubps (%rdi), %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: subps512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vsubps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: subps512fold: +; SKX: # BB#0: # %entry +; SKX-NEXT: vsubps (%rdi), %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp2 = load <16 x float>, <16 x float>* %x, align 4 %sub.i = fsub <16 x float> %y, %tmp2 @@ -85,157 +127,220 @@ entry: } define <8 x i64> @imulq512(<8 x i64> %y, <8 x i64> %x) { -; CHECK-LABEL: imulq512: -; CHECK: # BB#0: -; CHECK-NEXT: vpmullq %zmm0, %zmm1, %zmm0 # sched: [12:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: imulq512: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmullq %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: imulq512: ; SKX: # BB#0: -; SKX-NEXT: vpmullq %zmm0, %zmm1, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: vpmullq %zmm0, %zmm1, %zmm0 # sched: [12:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %z = mul <8 x i64>%x, %y ret <8 x i64>%z } define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { -; CHECK-LABEL: imulq256: -; CHECK: # BB#0: -; CHECK-NEXT: vpmullq %ymm0, %ymm1, %ymm0 # sched: [12:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: imulq256: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmullq %ymm0, %ymm1, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: imulq256: ; SKX: # BB#0: -; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vpmullq %ymm0, %ymm1, %ymm0 # sched: [12:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %z = mul <4 x i64>%x, %y ret <4 x i64>%z } define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { -; CHECK-LABEL: imulq128: -; CHECK: # BB#0: -; CHECK-NEXT: vpmullq %xmm0, %xmm1, %xmm0 # sched: [12:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: imulq128: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmullq %xmm0, %xmm1, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: imulq128: ; SKX: # BB#0: -; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: vpmullq %xmm0, %xmm1, %xmm0 # sched: [12:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %z = mul <2 x i64>%x, %y ret <2 x i64>%z } define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) { -; CHECK-LABEL: mulpd512: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vmulpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mulpd512: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vmulpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mulpd512: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmulpd %zmm0, %zmm1, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %mul.i = fmul <8 x double> %x, %y ret <8 x double> %mul.i } define <8 x double> @mulpd512fold(<8 x double> %y) { -; CHECK-LABEL: mulpd512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mulpd512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mulpd512fold: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmulpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %mul.i = fmul <8 x double> %y, ret <8 x double> %mul.i } define <16 x float> @mulps512(<16 x float> %y, <16 x float> %x) { -; CHECK-LABEL: mulps512: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mulps512: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vmulps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mulps512: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %mul.i = fmul <16 x float> %x, %y ret <16 x float> %mul.i } define <16 x float> @mulps512fold(<16 x float> %y) { -; CHECK-LABEL: mulps512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mulps512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mulps512fold: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmulps {{.*}}(%rip), %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %mul.i = fmul <16 x float> %y, ret <16 x float> %mul.i } define <8 x double> @divpd512(<8 x double> %y, <8 x double> %x) { -; CHECK-LABEL: divpd512: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vdivpd %zmm0, %zmm1, %zmm0 # sched: [23:2.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: divpd512: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vdivpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: divpd512: +; SKX: # BB#0: # %entry +; SKX-NEXT: vdivpd %zmm0, %zmm1, %zmm0 # sched: [23:2.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %div.i = fdiv <8 x double> %x, %y ret <8 x double> %div.i } define <8 x double> @divpd512fold(<8 x double> %y) { -; CHECK-LABEL: divpd512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [30:2.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: divpd512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: divpd512fold: +; SKX: # BB#0: # %entry +; SKX-NEXT: vdivpd {{.*}}(%rip), %zmm0, %zmm0 # sched: [30:2.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %div.i = fdiv <8 x double> %y, ret <8 x double> %div.i } define <16 x float> @divps512(<16 x float> %y, <16 x float> %x) { -; CHECK-LABEL: divps512: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [23:2.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: divps512: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vdivps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: divps512: +; SKX: # BB#0: # %entry +; SKX-NEXT: vdivps %zmm0, %zmm1, %zmm0 # sched: [23:2.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %div.i = fdiv <16 x float> %x, %y ret <16 x float> %div.i } define <16 x float> @divps512fold(<16 x float> %y) { -; CHECK-LABEL: divps512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 # sched: [24:2.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: divps512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: divps512fold: +; SKX: # BB#0: # %entry +; SKX-NEXT: vdivps {{.*}}(%rip), %zmm0, %zmm0 # sched: [24:2.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %div.i = fdiv <16 x float> %y, ret <16 x float> %div.i } define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { -; CHECK-LABEL: vpaddq_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddq_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddq_test: +; SKX: # BB#0: +; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = add <8 x i64> %i, %j ret <8 x i64> %x } define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind { -; CHECK-LABEL: vpaddq_fold_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddq_fold_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpaddq (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddq_fold_test: +; SKX: # BB#0: +; SKX-NEXT: vpaddq (%rdi), %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %tmp = load <8 x i64>, <8 x i64>* %j, align 4 %x = add <8 x i64> %i, %tmp ret <8 x i64> %x } define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind { -; CHECK-LABEL: vpaddq_broadcast_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddq_broadcast_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddq_broadcast_test: +; SKX: # BB#0: +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %x = add <8 x i64> %i, ret <8 x i64> %x } define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { -; CHECK-LABEL: vpaddq_broadcast2_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddq_broadcast2_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddq_broadcast2_test: +; SKX: # BB#0: +; SKX-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %tmp = load i64, i64* %j %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0 %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1 @@ -250,40 +355,62 @@ define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind { } define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { -; CHECK-LABEL: vpaddd_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_test: +; SKX: # BB#0: +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = add <16 x i32> %i, %j ret <16 x i32> %x } define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind { -; CHECK-LABEL: vpaddd_fold_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_fold_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_fold_test: +; SKX: # BB#0: +; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %tmp = load <16 x i32>, <16 x i32>* %j, align 4 %x = add <16 x i32> %i, %tmp ret <16 x i32> %x } define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind { -; CHECK-LABEL: vpaddd_broadcast_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_broadcast_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_broadcast_test: +; SKX: # BB#0: +; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %x = add <16 x i32> %i, ret <16 x i32> %x } define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { -; CHECK-LABEL: vpaddd_mask_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_mask_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_mask_test: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i @@ -291,12 +418,19 @@ define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %ma } define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone { -; CHECK-LABEL: vpaddd_maskz_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_maskz_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_maskz_test: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpaddd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %j %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer @@ -304,12 +438,19 @@ define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m } define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { -; CHECK-LABEL: vpaddd_mask_fold_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_mask_fold_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_mask_fold_test: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %j = load <16 x i32>, <16 x i32>* %j.ptr %x = add <16 x i32> %i, %j @@ -318,12 +459,19 @@ define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 } define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { -; CHECK-LABEL: vpaddd_mask_broadcast_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_mask_broadcast_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_mask_broadcast_test: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i @@ -331,12 +479,19 @@ define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) } define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone { -; CHECK-LABEL: vpaddd_maskz_fold_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_maskz_fold_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_maskz_fold_test: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpaddd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %j = load <16 x i32>, <16 x i32>* %j.ptr %x = add <16 x i32> %i, %j @@ -345,12 +500,19 @@ define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 } define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone { -; CHECK-LABEL: vpaddd_maskz_broadcast_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpaddd_maskz_broadcast_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpaddd_maskz_broadcast_test: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = add <16 x i32> %i, %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer @@ -358,38 +520,58 @@ define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) } define <8 x i64> @vpsubq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone { -; CHECK-LABEL: vpsubq_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpsubq_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsubq %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpsubq_test: +; SKX: # BB#0: +; SKX-NEXT: vpsubq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = sub <8 x i64> %i, %j ret <8 x i64> %x } define <16 x i32> @vpsubd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone { -; CHECK-LABEL: vpsubd_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpsubd_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpsubd_test: +; SKX: # BB#0: +; SKX-NEXT: vpsubd %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = sub <16 x i32> %i, %j ret <16 x i32> %x } define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) { -; CHECK-LABEL: vpmulld_test: -; CHECK: # BB#0: -; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm0 # sched: [8:0.67] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpmulld_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmulld %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpmulld_test: +; SKX: # BB#0: +; SKX-NEXT: vpmulld %zmm1, %zmm0, %zmm0 # sched: [8:0.67] +; SKX-NEXT: retq # sched: [7:1.00] %x = mul <16 x i32> %i, %j ret <16 x i32> %x } declare float @sqrtf(float) readnone define float @sqrtA(float %a) nounwind uwtable readnone ssp { -; CHECK-LABEL: sqrtA: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sqrtA: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sqrtA: +; SKX: # BB#0: # %entry +; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %conv1 = tail call float @sqrtf(float %a) nounwind readnone ret float %conv1 @@ -397,10 +579,15 @@ entry: declare double @sqrt(double) readnone define double @sqrtB(double %a) nounwind uwtable readnone ssp { -; CHECK-LABEL: sqrtB: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sqrtB: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [21:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sqrtB: +; SKX: # BB#0: # %entry +; SKX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # sched: [18:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %call = tail call double @sqrt(double %a) nounwind readnone ret double %call @@ -408,74 +595,101 @@ entry: declare float @llvm.sqrt.f32(float) define float @sqrtC(float %a) nounwind { -; CHECK-LABEL: sqrtC: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sqrtC: +; GENERIC: # BB#0: +; GENERIC-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [114:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sqrtC: +; SKX: # BB#0: +; SKX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = call float @llvm.sqrt.f32(float %a) ret float %b } declare <16 x float> @llvm.sqrt.v16f32(<16 x float>) define <16 x float> @sqrtD(<16 x float> %a) nounwind { -; CHECK-LABEL: sqrtD: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtps %zmm0, %zmm0 # sched: [19:2.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sqrtD: +; GENERIC: # BB#0: +; GENERIC-NEXT: vsqrtps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sqrtD: +; SKX: # BB#0: +; SKX-NEXT: vsqrtps %zmm0, %zmm0 # sched: [19:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a) ret <16 x float> %b } declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) define <8 x double> @sqrtE(<8 x double> %a) nounwind { -; CHECK-LABEL: sqrtE: -; CHECK: # BB#0: -; CHECK-NEXT: vsqrtpd %zmm0, %zmm0 # sched: [31:2.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sqrtE: +; GENERIC: # BB#0: +; GENERIC-NEXT: vsqrtpd %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sqrtE: +; SKX: # BB#0: +; SKX-NEXT: vsqrtpd %zmm0, %zmm0 # sched: [31:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a) ret <8 x double> %b } define <16 x float> @fadd_broadcast(<16 x float> %a) nounwind { -; CHECK-LABEL: fadd_broadcast: -; CHECK: # BB#0: -; CHECK-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: fadd_broadcast: +; GENERIC: # BB#0: +; GENERIC-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: fadd_broadcast: +; SKX: # BB#0: +; SKX-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %b = fadd <16 x float> %a, ret <16 x float> %b } define <8 x i64> @addq_broadcast(<8 x i64> %a) nounwind { -; CHECK-LABEL: addq_broadcast: -; CHECK: # BB#0: -; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: addq_broadcast: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: addq_broadcast: +; SKX: # BB#0: +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %b = add <8 x i64> %a, ret <8 x i64> %b } define <8 x i64> @orq_broadcast(<8 x i64> %a) nounwind { -; CHECK-LABEL: orq_broadcast: -; CHECK: # BB#0: -; CHECK-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: orq_broadcast: +; GENERIC: # BB#0: +; GENERIC-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: orq_broadcast: ; SKX: # BB#0: -; SKX-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: vorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %b = or <8 x i64> %a, ret <8 x i64> %b } define <16 x i32> @andd512fold(<16 x i32> %y, <16 x i32>* %x) { -; CHECK-LABEL: andd512fold: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandps (%rdi), %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: andd512fold: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vandps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: andd512fold: ; SKX: # BB#0: # %entry -; SKX-NEXT: vandps (%rdi), %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: vandps (%rdi), %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %a = load <16 x i32>, <16 x i32>* %x, align 4 %b = and <16 x i32> %y, %a @@ -483,14 +697,15 @@ entry: } define <8 x i64> @andqbrst(<8 x i64> %p1, i64* %ap) { -; CHECK-LABEL: andqbrst: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: andqbrst: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: andqbrst: ; SKX: # BB#0: # %entry -; SKX-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: vandpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %a = load i64, i64* %ap, align 8 %b = insertelement <8 x i64> undef, i64 %a, i32 0 @@ -500,12 +715,19 @@ entry: } define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, -; CHECK-LABEL: test_mask_vaddps: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mask_vaddps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_vaddps: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vaddps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] <16 x float> %j, <16 x i32> %mask1) nounwind readnone { %mask = icmp ne <16 x i32> %mask1, zeroinitializer @@ -514,30 +736,40 @@ define <16 x float> @test_mask_vaddps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, -; CHECK-LABEL: test_mask_vmulps: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <16 x float> %j, <16 x i32> %mask1) - nounwind readnone { +define <16 x float> @test_mask_vmulps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { +; GENERIC-LABEL: test_mask_vmulps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_vmulps: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vmulps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = fmul <16 x float> %i, %j %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst ret <16 x float> %r } -define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, -; CHECK-LABEL: test_mask_vminps: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <16 x float> %j, <16 x i32> %mask1) - nounwind readnone { +define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { +; GENERIC-LABEL: test_mask_vminps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_vminps: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vminps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %cmp_res = fcmp olt <16 x float> %i, %j %min = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j @@ -545,21 +777,20 @@ define <16 x float> @test_mask_vminps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, -; CHECK-LABEL: test_mask_vminpd: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone { +; GENERIC-LABEL: test_mask_vminpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mask_vminpd: ; SKX: # BB#0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 -; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq - <8 x double> %j, <8 x i32> %mask1) - nounwind readnone { +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vminpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %cmp_res = fcmp olt <8 x double> %i, %j %min = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j @@ -567,15 +798,20 @@ define <8 x double> @test_mask_vminpd(<8 x double> %dst, <8 x double> %i, ret <8 x double> %r } -define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, -; CHECK-LABEL: test_mask_vmaxps: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <16 x float> %j, <16 x i32> %mask1) - nounwind readnone { +define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { +; GENERIC-LABEL: test_mask_vmaxps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_vmaxps: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vmaxps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %cmp_res = fcmp ogt <16 x float> %i, %j %max = select <16 x i1> %cmp_res, <16 x float> %i, <16 x float> %j @@ -583,21 +819,20 @@ define <16 x float> @test_mask_vmaxps(<16 x float> %dst, <16 x float> %i, ret <16 x float> %r } -define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, -; CHECK-LABEL: test_mask_vmaxpd: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i32> %mask1) nounwind readnone { +; GENERIC-LABEL: test_mask_vmaxpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mask_vmaxpd: ; SKX: # BB#0: -; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 -; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq - <8 x double> %j, <8 x i32> %mask1) - nounwind readnone { +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vmaxpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %cmp_res = fcmp ogt <8 x double> %i, %j %max = select <8 x i1> %cmp_res, <8 x double> %i, <8 x double> %j @@ -605,74 +840,100 @@ define <8 x double> @test_mask_vmaxpd(<8 x double> %dst, <8 x double> %i, ret <8 x double> %r } -define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, -; CHECK-LABEL: test_mask_vsubps: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <16 x float> %j, <16 x i32> %mask1) - nounwind readnone { +define <16 x float> @test_mask_vsubps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { +; GENERIC-LABEL: test_mask_vsubps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_vsubps: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vsubps %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = fsub <16 x float> %i, %j %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst ret <16 x float> %r } -define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, -; CHECK-LABEL: test_mask_vdivps: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [23:2.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <16 x float> %j, <16 x i32> %mask1) - nounwind readnone { +define <16 x float> @test_mask_vdivps(<16 x float> %dst, <16 x float> %i, <16 x float> %j, <16 x i32> %mask1) nounwind readnone { +; GENERIC-LABEL: test_mask_vdivps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_vdivps: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vdivps %zmm2, %zmm1, %zmm0 {%k1} # sched: [23:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %x = fdiv <16 x float> %i, %j %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %dst ret <16 x float> %r } -define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, -; CHECK-LABEL: test_mask_vaddpd: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <8 x double> %j, <8 x i64> %mask1) - nounwind readnone { +define <8 x double> @test_mask_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone { +; GENERIC-LABEL: test_mask_vaddpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_vaddpd: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vaddpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %x = fadd <8 x double> %i, %j %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> %dst ret <8 x double> %r } -define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, -; CHECK-LABEL: test_maskz_vaddpd: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <8 x i64> %mask1) nounwind readnone { +define <8 x double> @test_maskz_vaddpd(<8 x double> %i, <8 x double> %j, <8 x i64> %mask1) nounwind readnone { +; GENERIC-LABEL: test_maskz_vaddpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_maskz_vaddpd: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vaddpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %x = fadd <8 x double> %i, %j %r = select <8 x i1> %mask, <8 x double> %x, <8 x double> zeroinitializer ret <8 x double> %r } -define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, -; CHECK-LABEL: test_mask_fold_vaddpd: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <8 x double>* %j, <8 x i64> %mask1) - nounwind { +define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind { +; GENERIC-LABEL: test_mask_fold_vaddpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_fold_vaddpd: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vaddpd (%rdi), %zmm1, %zmm0 {%k1} # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load <8 x double>, <8 x double>* %j, align 8 %x = fadd <8 x double> %i, %tmp @@ -680,14 +941,20 @@ define <8 x double> @test_mask_fold_vaddpd(<8 x double> %dst, <8 x double> %i, ret <8 x double> %r } -define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, -; CHECK-LABEL: test_maskz_fold_vaddpd: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - <8 x i64> %mask1) nounwind { +define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, <8 x i64> %mask1) nounwind { +; GENERIC-LABEL: test_maskz_fold_vaddpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_maskz_fold_vaddpd: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vaddpd (%rdi), %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load <8 x double>, <8 x double>* %j, align 8 %x = fadd <8 x double> %i, %tmp @@ -696,10 +963,15 @@ define <8 x double> @test_maskz_fold_vaddpd(<8 x double> %i, <8 x double>* %j, } define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind { -; CHECK-LABEL: test_broadcast_vaddpd: -; CHECK: # BB#0: -; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_broadcast_vaddpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_broadcast_vaddpd: +; SKX: # BB#0: +; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %tmp = load double, double* %j %b = insertelement <8 x double> undef, double %tmp, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, @@ -708,15 +980,22 @@ define <8 x double> @test_broadcast_vaddpd(<8 x double> %i, double* %j) nounwind ret <8 x double> %x } -define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, -; CHECK-LABEL: test_mask_broadcast_vaddpd: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] - double* %j, <8 x i64> %mask1) nounwind { +define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> %i, double* %j, <8 x i64> %mask1) nounwind { +; GENERIC-LABEL: test_mask_broadcast_vaddpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 +; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mask_broadcast_vaddpd: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm0, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1} # sched: [11:0.50] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load double, double* %j %b = insertelement <8 x double> undef, double %tmp, i32 0 @@ -728,12 +1007,19 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double> } define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j, -; CHECK-LABEL: test_maskz_broadcast_vaddpd: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_maskz_broadcast_vaddpd: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_maskz_broadcast_vaddpd: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vaddpd (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] <8 x i64> %mask1) nounwind { %mask = icmp ne <8 x i64> %mask1, zeroinitializer %tmp = load double, double* %j @@ -746,41 +1032,44 @@ define <8 x double> @test_maskz_broadcast_vaddpd(<8 x double> %i, double* %j, } define <16 x float> @test_fxor(<16 x float> %a) { -; CHECK-LABEL: test_fxor: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_fxor: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_fxor: ; SKX: # BB#0: -; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %res = fsub <16 x float> , %a ret <16 x float>%res } define <8 x float> @test_fxor_8f32(<8 x float> %a) { -; CHECK-LABEL: test_fxor_8f32: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_fxor_8f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_fxor_8f32: ; SKX: # BB#0: -; SKX-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: vxorps {{.*}}(%rip){1to8}, %ymm0, %ymm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %res = fsub <8 x float> , %a ret <8 x float>%res } define <8 x double> @fabs_v8f64(<8 x double> %p) -; CHECK-LABEL: fabs_v8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: fabs_v8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: fabs_v8f64: ; SKX: # BB#0: -; SKX-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: vandpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] { %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p) ret <8 x double> %t @@ -788,14 +1077,15 @@ define <8 x double> @fabs_v8f64(<8 x double> %p) declare <8 x double> @llvm.fabs.v8f64(<8 x double> %p) define <16 x float> @fabs_v16f32(<16 x float> %p) -; CHECK-LABEL: fabs_v16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: fabs_v16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: fabs_v16f32: ; SKX: # BB#0: -; SKX-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: vandps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] { %t = call <16 x float> @llvm.fabs.v16f32(<16 x float> %p) ret <16 x float> %t @@ -803,17 +1093,29 @@ define <16 x float> @fabs_v16f32(<16 x float> %p) declare <16 x float> @llvm.fabs.v16f32(<16 x float> %p) define double @test1(double %a, double %b) nounwind { -; CHECK-LABEL: test1: -; CHECK: # BB#0: -; CHECK-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] -; CHECK-NEXT: jne .LBB64_1 # sched: [1:0.50] -; CHECK-NEXT: jnp .LBB64_2 # sched: [1:0.50] -; CHECK-NEXT: .LBB64_1: # %l1 -; CHECK-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; CHECK-NEXT: .LBB64_2: # %l2 -; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] +; GENERIC-NEXT: jne .LBB64_1 # sched: [1:1.00] +; GENERIC-NEXT: jnp .LBB64_2 # sched: [1:1.00] +; GENERIC-NEXT: .LBB64_1: # %l1 +; GENERIC-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; GENERIC-NEXT: .LBB64_2: # %l2 +; GENERIC-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test1: +; SKX: # BB#0: +; SKX-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] +; SKX-NEXT: jne .LBB64_1 # sched: [1:0.50] +; SKX-NEXT: jnp .LBB64_2 # sched: [1:0.50] +; SKX-NEXT: .LBB64_1: # %l1 +; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] +; SKX-NEXT: .LBB64_2: # %l2 +; SKX-NEXT: vaddsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %tobool = fcmp une double %a, %b br i1 %tobool, label %l1, label %l2 @@ -826,16 +1128,27 @@ l2: } define float @test2(float %a, float %b) nounwind { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: vucomiss %xmm0, %xmm1 # sched: [2:1.00] -; CHECK-NEXT: jbe .LBB65_2 # sched: [1:0.50] -; CHECK-NEXT: # BB#1: # %l1 -; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; CHECK-NEXT: .LBB65_2: # %l2 -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vucomiss %xmm0, %xmm1 # sched: [2:1.00] +; GENERIC-NEXT: jbe .LBB65_2 # sched: [1:1.00] +; GENERIC-NEXT: # BB#1: # %l1 +; GENERIC-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; GENERIC-NEXT: .LBB65_2: # %l2 +; GENERIC-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2: +; SKX: # BB#0: +; SKX-NEXT: vucomiss %xmm0, %xmm1 # sched: [2:1.00] +; SKX-NEXT: jbe .LBB65_2 # sched: [1:0.50] +; SKX-NEXT: # BB#1: # %l1 +; SKX-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] +; SKX-NEXT: .LBB65_2: # %l2 +; SKX-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %tobool = fcmp olt float %a, %b br i1 %tobool, label %l1, label %l2 @@ -848,18 +1161,19 @@ l2: } define i32 @test3(float %a, float %b) { -; CHECK-LABEL: test3: -; CHECK: # BB#0: -; CHECK-NEXT: vcmpeqss %xmm1, %xmm0, %k0 -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: movzbl %al, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcmpeqss %xmm1, %xmm0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test3: -; SKX: ## BB#0: +; SKX: # BB#0: ; SKX-NEXT: vcmpeqss %xmm1, %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: movzbl %al, %eax -; SKX-NEXT: retq +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: movzbl %al, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %cmp10.i = fcmp oeq float %a, %b %conv11.i = zext i1 %cmp10.i to i32 @@ -867,19 +1181,33 @@ define i32 @test3(float %a, float %b) { } define float @test5(float %p) #0 { -; CHECK-LABEL: test5: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vucomiss %xmm1, %xmm0 # sched: [2:1.00] -; CHECK-NEXT: jne .LBB67_1 # sched: [1:0.50] -; CHECK-NEXT: jp .LBB67_1 # sched: [1:0.50] -; CHECK-NEXT: # BB#2: # %return -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; CHECK-NEXT: .LBB67_1: # %if.end -; CHECK-NEXT: seta %al # sched: [2:1.00] -; CHECK-NEXT: movzbl %al, %eax # sched: [1:0.25] -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test5: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vucomiss %xmm1, %xmm0 # sched: [2:1.00] +; GENERIC-NEXT: jne .LBB67_1 # sched: [1:1.00] +; GENERIC-NEXT: jp .LBB67_1 # sched: [1:1.00] +; GENERIC-NEXT: # BB#2: # %return +; GENERIC-NEXT: retq # sched: [1:1.00] +; GENERIC-NEXT: .LBB67_1: # %if.end +; GENERIC-NEXT: seta %al # sched: [2:1.00] +; GENERIC-NEXT: movzbl %al, %eax # sched: [1:0.33] +; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test5: +; SKX: # BB#0: # %entry +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vucomiss %xmm1, %xmm0 # sched: [2:1.00] +; SKX-NEXT: jne .LBB67_1 # sched: [1:0.50] +; SKX-NEXT: jp .LBB67_1 # sched: [1:0.50] +; SKX-NEXT: # BB#2: # %return +; SKX-NEXT: retq # sched: [7:1.00] +; SKX-NEXT: .LBB67_1: # %if.end +; SKX-NEXT: seta %al # sched: [2:1.00] +; SKX-NEXT: movzbl %al, %eax # sched: [1:0.25] +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %cmp = fcmp oeq float %p, 0.000000e+00 br i1 %cmp, label %return, label %if.end @@ -895,24 +1223,38 @@ return: ; preds = %if.end, %entry } define i32 @test6(i32 %a, i32 %b) { -; CHECK-LABEL: test6: -; CHECK: # BB#0: -; CHECK-NEXT: xorl %eax, %eax # sched: [1:0.25] -; CHECK-NEXT: cmpl %esi, %edi # sched: [1:0.25] -; CHECK-NEXT: sete %al # sched: [1:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test6: +; GENERIC: # BB#0: +; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33] +; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] +; GENERIC-NEXT: sete %al # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test6: +; SKX: # BB#0: +; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25] +; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] +; SKX-NEXT: sete %al # sched: [1:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %cmp = icmp eq i32 %a, %b %res = zext i1 %cmp to i32 ret i32 %res } define i32 @test7(double %x, double %y) #2 { -; CHECK-LABEL: test7: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: xorl %eax, %eax # sched: [1:0.25] -; CHECK-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] -; CHECK-NEXT: setne %al # sched: [1:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test7: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: xorl %eax, %eax # sched: [1:0.33] +; GENERIC-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] +; GENERIC-NEXT: setne %al # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test7: +; SKX: # BB#0: # %entry +; SKX-NEXT: xorl %eax, %eax # sched: [1:0.25] +; SKX-NEXT: vucomisd %xmm1, %xmm0 # sched: [2:1.00] +; SKX-NEXT: setne %al # sched: [1:0.50] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = fcmp one double %x, %y %or = zext i1 %0 to i32 @@ -920,17 +1262,29 @@ entry: } define i32 @test8(i32 %a1, i32 %a2, i32 %a3) { -; CHECK-LABEL: test8: -; CHECK: # BB#0: -; CHECK-NEXT: notl %edi # sched: [1:0.25] -; CHECK-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 -; CHECK-NEXT: # sched: [1:0.25] -; CHECK-NEXT: testl %edx, %edx # sched: [1:0.25] -; CHECK-NEXT: movl $1, %eax # sched: [1:0.25] -; CHECK-NEXT: cmovel %eax, %edx # sched: [1:0.50] -; CHECK-NEXT: orl %edi, %esi # sched: [1:0.25] -; CHECK-NEXT: cmovnel %edx, %eax # sched: [1:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test8: +; GENERIC: # BB#0: +; GENERIC-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; GENERIC-NEXT: # sched: [1:0.33] +; GENERIC-NEXT: testl %edx, %edx # sched: [1:0.33] +; GENERIC-NEXT: movl $1, %eax # sched: [1:0.33] +; GENERIC-NEXT: cmovel %eax, %edx # sched: [2:0.67] +; GENERIC-NEXT: notl %edi # sched: [1:0.33] +; GENERIC-NEXT: orl %edi, %esi # sched: [1:0.33] +; GENERIC-NEXT: cmovnel %edx, %eax # sched: [2:0.67] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test8: +; SKX: # BB#0: +; SKX-NEXT: notl %edi # sched: [1:0.25] +; SKX-NEXT: xorl $-2147483648, %esi # imm = 0x80000000 +; SKX-NEXT: # sched: [1:0.25] +; SKX-NEXT: testl %edx, %edx # sched: [1:0.25] +; SKX-NEXT: movl $1, %eax # sched: [1:0.25] +; SKX-NEXT: cmovel %eax, %edx # sched: [1:0.50] +; SKX-NEXT: orl %edi, %esi # sched: [1:0.25] +; SKX-NEXT: cmovnel %edx, %eax # sched: [1:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %tmp1 = icmp eq i32 %a1, -1 %tmp2 = icmp eq i32 %a2, -2147483648 %tmp3 = and i1 %tmp1, %tmp2 @@ -941,16 +1295,27 @@ define i32 @test8(i32 %a1, i32 %a2, i32 %a3) { } define i32 @test9(i64 %a) { -; CHECK-LABEL: test9: -; CHECK: # BB#0: -; CHECK-NEXT: testb $1, %dil # sched: [1:0.25] -; CHECK-NEXT: jne .LBB71_2 # sched: [1:0.50] -; CHECK-NEXT: # BB#1: # %A -; CHECK-NEXT: movl $6, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; CHECK-NEXT: .LBB71_2: # %B -; CHECK-NEXT: movl $7, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test9: +; GENERIC: # BB#0: +; GENERIC-NEXT: testb $1, %dil # sched: [1:0.33] +; GENERIC-NEXT: jne .LBB71_2 # sched: [1:1.00] +; GENERIC-NEXT: # BB#1: # %A +; GENERIC-NEXT: movl $6, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; GENERIC-NEXT: .LBB71_2: # %B +; GENERIC-NEXT: movl $7, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test9: +; SKX: # BB#0: +; SKX-NEXT: testb $1, %dil # sched: [1:0.25] +; SKX-NEXT: jne .LBB71_2 # sched: [1:0.50] +; SKX-NEXT: # BB#1: # %A +; SKX-NEXT: movl $6, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] +; SKX-NEXT: .LBB71_2: # %B +; SKX-NEXT: movl $7, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %b = and i64 %a, 1 %cmp10.i = icmp eq i64 %b, 0 br i1 %cmp10.i, label %A, label %B @@ -961,22 +1326,39 @@ B: } define i32 @test10(i64 %b, i64 %c, i1 %d) { -; CHECK-LABEL: test10: -; CHECK: # BB#0: -; CHECK-NEXT: movl %edx, %eax # sched: [1:0.25] -; CHECK-NEXT: andb $1, %al # sched: [1:0.25] -; CHECK-NEXT: cmpq %rsi, %rdi # sched: [1:0.25] -; CHECK-NEXT: sete %cl # sched: [1:0.50] -; CHECK-NEXT: orb %dl, %cl # sched: [1:0.25] -; CHECK-NEXT: andb $1, %cl # sched: [1:0.25] -; CHECK-NEXT: cmpb %cl, %al # sched: [1:0.25] -; CHECK-NEXT: je .LBB72_1 # sched: [1:0.50] -; CHECK-NEXT: # BB#2: # %if.end.i -; CHECK-NEXT: movl $6, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; CHECK-NEXT: .LBB72_1: # %if.then.i -; CHECK-NEXT: movl $5, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test10: +; GENERIC: # BB#0: +; GENERIC-NEXT: movl %edx, %eax # sched: [1:0.33] +; GENERIC-NEXT: andb $1, %al # sched: [1:0.33] +; GENERIC-NEXT: cmpq %rsi, %rdi # sched: [1:0.33] +; GENERIC-NEXT: sete %cl # sched: [1:0.50] +; GENERIC-NEXT: orb %dl, %cl # sched: [1:0.33] +; GENERIC-NEXT: andb $1, %cl # sched: [1:0.33] +; GENERIC-NEXT: cmpb %cl, %al # sched: [1:0.33] +; GENERIC-NEXT: je .LBB72_1 # sched: [1:1.00] +; GENERIC-NEXT: # BB#2: # %if.end.i +; GENERIC-NEXT: movl $6, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; GENERIC-NEXT: .LBB72_1: # %if.then.i +; GENERIC-NEXT: movl $5, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test10: +; SKX: # BB#0: +; SKX-NEXT: movl %edx, %eax # sched: [1:0.25] +; SKX-NEXT: andb $1, %al # sched: [1:0.25] +; SKX-NEXT: cmpq %rsi, %rdi # sched: [1:0.25] +; SKX-NEXT: sete %cl # sched: [1:0.50] +; SKX-NEXT: orb %dl, %cl # sched: [1:0.25] +; SKX-NEXT: andb $1, %cl # sched: [1:0.25] +; SKX-NEXT: cmpb %cl, %al # sched: [1:0.25] +; SKX-NEXT: je .LBB72_1 # sched: [1:0.50] +; SKX-NEXT: # BB#2: # %if.end.i +; SKX-NEXT: movl $6, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] +; SKX-NEXT: .LBB72_1: # %if.then.i +; SKX-NEXT: movl $5, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %cmp8.i = icmp eq i64 %b, %c %or1 = or i1 %d, %cmp8.i @@ -991,294 +1373,388 @@ if.end.i: } define <16 x float> @sitof32(<16 x i32> %a) nounwind { -; CHECK-LABEL: sitof32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sitof32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sitof32: +; SKX: # BB#0: +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <8 x double> @sltof864(<8 x i64> %a) { -; CHECK-LABEL: sltof864: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sltof864: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sltof864: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <4 x double> @slto4f64(<4 x i64> %a) { -; CHECK-LABEL: slto4f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: slto4f64: -; VLDQ: # BB#0: -; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 -; VLDQ-NEXT: retq +; GENERIC-LABEL: slto4f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2pd %ymm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: slto4f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2pd %ymm0, %ymm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <4 x i64> %a to <4 x double> ret <4 x double> %b } define <2 x double> @slto2f64(<2 x i64> %a) { -; CHECK-LABEL: slto2f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: slto2f64: -; VLDQ: # BB#0: -; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 -; VLDQ-NEXT: retq +; GENERIC-LABEL: slto2f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: slto2f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <2 x i64> %a to <2 x double> ret <2 x double> %b } define <2 x float> @sltof2f32(<2 x i64> %a) { -; CHECK-LABEL: sltof2f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 # sched: [5:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: sltof2f32: -; VLDQ: # BB#0: -; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 -; VLDQ-NEXT: retq +; GENERIC-LABEL: sltof2f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2ps %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sltof2f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2ps %xmm0, %xmm0 # sched: [5:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <2 x i64> %a to <2 x float> ret <2 x float>%b } define <4 x float> @slto4f32_mem(<4 x i64>* %a) { -; CHECK-LABEL: slto4f32_mem: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2psy (%rdi), %xmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: slto4f32_mem: -; VLDQ: # BB#0: -; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 -; VLDQ-NEXT: retq +; GENERIC-LABEL: slto4f32_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2psy (%rdi), %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: slto4f32_mem: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2psy (%rdi), %xmm0 +; SKX-NEXT: retq # sched: [7:1.00] %a1 = load <4 x i64>, <4 x i64>* %a, align 8 %b = sitofp <4 x i64> %a1 to <4 x float> ret <4 x float>%b } define <4 x i64> @f64to4sl(<4 x double> %a) { -; CHECK-LABEL: f64to4sl: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: f64to4sl: -; VLDQ: # BB#0: -; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 -; VLDQ-NEXT: retq +; GENERIC-LABEL: f64to4sl: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttpd2qq %ymm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to4sl: +; SKX: # BB#0: +; SKX-NEXT: vcvttpd2qq %ymm0, %ymm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <4 x double> %a to <4 x i64> ret <4 x i64> %b } define <4 x i64> @f32to4sl(<4 x float> %a) { -; CHECK-LABEL: f32to4sl: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: f32to4sl: -; VLDQ: # BB#0: -; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 -; VLDQ-NEXT: retq +; GENERIC-LABEL: f32to4sl: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttps2qq %xmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32to4sl: +; SKX: # BB#0: +; SKX-NEXT: vcvttps2qq %xmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <4 x float> %a to <4 x i64> ret <4 x i64> %b } define <4 x float> @slto4f32(<4 x i64> %a) { -; CHECK-LABEL: slto4f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 # sched: [7:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: slto4f32: -; VLDQ: # BB#0: -; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 -; VLDQ-NEXT: vzeroupper -; VLDQ-NEXT: retq +; GENERIC-LABEL: slto4f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2ps %ymm0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: slto4f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2ps %ymm0, %xmm0 # sched: [7:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <4 x float> @ulto4f32(<4 x i64> %a) { -; CHECK-LABEL: ulto4f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 # sched: [7:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: ulto4f32: -; VLDQ: # BB#0: -; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 -; VLDQ-NEXT: vzeroupper -; VLDQ-NEXT: retq +; GENERIC-LABEL: ulto4f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtuqq2ps %ymm0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ulto4f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtuqq2ps %ymm0, %xmm0 # sched: [7:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %b } define <8 x double> @ulto8f64(<8 x i64> %a) { -; CHECK-LABEL: ulto8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ulto8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ulto8f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <16 x double> @ulto16f64(<16 x i64> %a) { -; CHECK-LABEL: ulto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vcvtuqq2pd %zmm1, %zmm1 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ulto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtuqq2pd %zmm0, %zmm0 +; GENERIC-NEXT: vcvtuqq2pd %zmm1, %zmm1 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ulto16f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtuqq2pd %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vcvtuqq2pd %zmm1, %zmm1 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } define <16 x i32> @f64to16si(<16 x float> %a) nounwind { -; CHECK-LABEL: f64to16si: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to16si: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttps2dq %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to16si: +; SKX: # BB#0: +; SKX-NEXT: vcvttps2dq %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <16 x float> %a to <16 x i32> ret <16 x i32> %b } define <16 x i32> @f32to16ui(<16 x float> %a) nounwind { -; CHECK-LABEL: f32to16ui: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32to16ui: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttps2udq %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32to16ui: +; SKX: # BB#0: +; SKX-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <16 x float> %a to <16 x i32> ret <16 x i32> %b } define <16 x i8> @f32to16uc(<16 x float> %f) { -; CHECK-LABEL: f32to16uc: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vpmovdb %zmm0, %xmm0 # sched: [4:2.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32to16uc: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttps2udq %zmm0, %zmm0 +; GENERIC-NEXT: vpmovdb %zmm0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32to16uc: +; SKX: # BB#0: +; SKX-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vpmovdb %zmm0, %xmm0 # sched: [4:2.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = fptoui <16 x float> %f to <16 x i8> ret <16 x i8> %res } define <16 x i16> @f32to16us(<16 x float> %f) { -; CHECK-LABEL: f32to16us: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vpmovdw %zmm0, %ymm0 # sched: [4:2.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32to16us: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttps2udq %zmm0, %zmm0 +; GENERIC-NEXT: vpmovdw %zmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32to16us: +; SKX: # BB#0: +; SKX-NEXT: vcvttps2udq %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vpmovdw %zmm0, %ymm0 # sched: [4:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = fptoui <16 x float> %f to <16 x i16> ret <16 x i16> %res } define <8 x i32> @f32to8ui(<8 x float> %a) nounwind { -; CHECK-LABEL: f32to8ui: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32to8ui: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttps2udq %ymm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32to8ui: +; SKX: # BB#0: +; SKX-NEXT: vcvttps2udq %ymm0, %ymm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <8 x float> %a to <8 x i32> ret <8 x i32> %b } define <4 x i32> @f32to4ui(<4 x float> %a) nounwind { -; CHECK-LABEL: f32to4ui: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32to4ui: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttps2udq %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32to4ui: +; SKX: # BB#0: +; SKX-NEXT: vcvttps2udq %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <4 x float> %a to <4 x i32> ret <4 x i32> %b } define <8 x i32> @f64to8ui(<8 x double> %a) nounwind { -; CHECK-LABEL: f64to8ui: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttpd2udq %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to8ui: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttpd2udq %zmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to8ui: +; SKX: # BB#0: +; SKX-NEXT: vcvttpd2udq %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <8 x double> %a to <8 x i32> ret <8 x i32> %b } define <8 x i16> @f64to8us(<8 x double> %f) { -; CHECK-LABEL: f64to8us: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: vpmovdw %ymm0, %xmm0 # sched: [4:2.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to8us: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 +; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to8us: +; SKX: # BB#0: +; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: vpmovdw %ymm0, %xmm0 # sched: [4:2.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = fptoui <8 x double> %f to <8 x i16> ret <8 x i16> %res } define <8 x i8> @f64to8uc(<8 x double> %f) { -; CHECK-LABEL: f64to8uc: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: vpmovdw %ymm0, %xmm0 # sched: [4:2.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to8uc: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 +; GENERIC-NEXT: vpmovdw %ymm0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to8uc: +; SKX: # BB#0: +; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: vpmovdw %ymm0, %xmm0 # sched: [4:2.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = fptoui <8 x double> %f to <8 x i8> ret <8 x i8> %res } define <4 x i32> @f64to4ui(<4 x double> %a) nounwind { -; CHECK-LABEL: f64to4ui: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0 # sched: [7:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to4ui: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttpd2udq %ymm0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to4ui: +; SKX: # BB#0: +; SKX-NEXT: vcvttpd2udq %ymm0, %xmm0 # sched: [7:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui <4 x double> %a to <4 x i32> ret <4 x i32> %b } define <8 x double> @sito8f64(<8 x i32> %a) { -; CHECK-LABEL: sito8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sito8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sito8f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i32> %a to <8 x double> ret <8 x double> %b } define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { -; CHECK-LABEL: i32to8f64_mask: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLBW-LABEL: i32to8f64_mask: -; VLBW: # BB#0: -; VLBW-NEXT: kmovd %edi, %k1 -; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; VLBW-NEXT: retq +; GENERIC-LABEL: i32to8f64_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: i32to8f64_mask: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] ; VLNOBW-LABEL: i32to8f64_mask: ; VLNOBW: # BB#0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; VLNOBW-NEXT: retq +; VLNOBW-NEXT: ret{{[l|q]}} %1 = bitcast i8 %c to <8 x i1> %2 = sitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a ret <8 x double> %3 } define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { -; CHECK-LABEL: sito8f64_maskz: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLBW-LABEL: sito8f64_maskz: -; VLBW: # BB#0: -; VLBW-NEXT: kmovd %edi, %k1 -; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; VLBW-NEXT: retq +; GENERIC-LABEL: sito8f64_maskz: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sito8f64_maskz: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] ; VLNOBW-LABEL: sito8f64_maskz: ; VLNOBW: # BB#0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; VLNOBW-NEXT: retq +; VLNOBW-NEXT: ret{{[l|q]}} %1 = bitcast i8 %b to <8 x i1> %2 = sitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer @@ -1286,63 +1762,100 @@ define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { } define <8 x i32> @f64to8si(<8 x double> %a) { -; CHECK-LABEL: f64to8si: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to8si: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttpd2dq %zmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to8si: +; SKX: # BB#0: +; SKX-NEXT: vcvttpd2dq %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <8 x double> %a to <8 x i32> ret <8 x i32> %b } define <4 x i32> @f64to4si(<4 x double> %a) { -; CHECK-LABEL: f64to4si: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to4si: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [4:1.00] +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to4si: +; SKX: # BB#0: +; SKX-NEXT: vcvttpd2dq %ymm0, %xmm0 # sched: [7:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi <4 x double> %a to <4 x i32> ret <4 x i32> %b } define <16 x float> @f64to16f32(<16 x double> %b) nounwind { -; CHECK-LABEL: f64to16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: vcvtpd2ps %zmm1, %ymm1 # sched: [7:1.00] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtpd2ps %zmm0, %ymm0 +; GENERIC-NEXT: vcvtpd2ps %zmm1, %ymm1 +; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to16f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: vcvtpd2ps %zmm1, %ymm1 # sched: [7:1.00] +; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = fptrunc <16 x double> %b to <16 x float> ret <16 x float> %a } define <4 x float> @f64to4f32(<4 x double> %b) { -; CHECK-LABEL: f64to4f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to4f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [4:1.00] +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to4f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 # sched: [7:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = fptrunc <4 x double> %b to <4 x float> ret <4 x float> %a } define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) { -; CHECK-LABEL: f64to4f32_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [7:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64to4f32_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 +; GENERIC-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64to4f32_mask: +; SKX: # BB#0: +; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} # sched: [7:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = fptrunc <4 x double> %b to <4 x float> %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer ret <4 x float> %c } define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { -; CHECK-LABEL: f64tof32_inreg: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64tof32_inreg: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64tof32_inreg: +; SKX: # BB#0: +; SKX-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %ext = extractelement <2 x double> %a0, i32 0 %cvt = fptrunc double %ext to float %res = insertelement <4 x float> %a1, float %cvt, i32 0 @@ -1350,20 +1863,31 @@ define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { } define <8 x double> @f32to8f64(<8 x float> %b) nounwind { -; CHECK-LABEL: f32to8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32to8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtps2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32to8f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = fpext <8 x float> %b to <8 x double> ret <8 x double> %a } define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) { -; CHECK-LABEL: f32to4f64_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vcmpltpd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32to4f64_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcmpltpd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32to4f64_mask: +; SKX: # BB#0: +; SKX-NEXT: vcmpltpd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = fpext <4 x float> %b to <4 x double> %mask = fcmp ogt <4 x double> %a1, %b1 %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer @@ -1371,10 +1895,15 @@ define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x doubl } define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { -; CHECK-LABEL: f32tof64_inreg: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32tof64_inreg: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32tof64_inreg: +; SKX: # BB#0: +; SKX-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %ext = extractelement <4 x float> %a1, i32 0 %cvt = fpext float %ext to double %res = insertelement <2 x double> %a0, double %cvt, i32 0 @@ -1382,10 +1911,15 @@ define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind } define double @sltof64_load(i64* nocapture %e) { -; CHECK-LABEL: sltof64_load: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sltof64_load: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sltof64_load: +; SKX: # BB#0: # %entry +; SKX-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp1 = load i64, i64* %e, align 8 %conv = sitofp i64 %tmp1 to double @@ -1393,10 +1927,15 @@ entry: } define double @sitof64_load(i32* %e) { -; CHECK-LABEL: sitof64_load: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sitof64_load: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sitof64_load: +; SKX: # BB#0: # %entry +; SKX-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp1 = load i32, i32* %e, align 4 %conv = sitofp i32 %tmp1 to double @@ -1404,10 +1943,15 @@ entry: } define float @sitof32_load(i32* %e) { -; CHECK-LABEL: sitof32_load: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sitof32_load: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sitof32_load: +; SKX: # BB#0: # %entry +; SKX-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp1 = load i32, i32* %e, align 4 %conv = sitofp i32 %tmp1 to float @@ -1415,10 +1959,15 @@ entry: } define float @sltof32_load(i64* %e) { -; CHECK-LABEL: sltof32_load: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sltof32_load: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 # sched: [10:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sltof32_load: +; SKX: # BB#0: # %entry +; SKX-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %tmp1 = load i64, i64* %e, align 8 %conv = sitofp i64 %tmp1 to float @@ -1426,12 +1975,19 @@ entry: } define void @f32tof64_loadstore() { -; CHECK-LABEL: f32tof64_loadstore: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] -; CHECK-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00] -; CHECK-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f32tof64_loadstore: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] +; GENERIC-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f32tof64_loadstore: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] +; SKX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; SKX-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %f = alloca float, align 4 %d = alloca double, align 8 @@ -1442,12 +1998,19 @@ entry: } define void @f64tof32_loadstore() nounwind uwtable { -; CHECK-LABEL: f64tof32_loadstore: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50] -; CHECK-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] -; CHECK-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: f64tof32_loadstore: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50] +; GENERIC-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [4:1.00] +; GENERIC-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: f64tof32_loadstore: +; SKX: # BB#0: # %entry +; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50] +; SKX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 # sched: [5:1.00] +; SKX-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %f = alloca float, align 4 %d = alloca double, align 8 @@ -1458,149 +2021,209 @@ entry: } define double @long_to_double(i64 %x) { -; CHECK-LABEL: long_to_double: -; CHECK: # BB#0: -; CHECK-NEXT: vmovq %rdi, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: long_to_double: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: long_to_double: +; SKX: # BB#0: +; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast i64 %x to double ret double %res } define i64 @double_to_long(double %x) { -; CHECK-LABEL: double_to_long: -; CHECK: # BB#0: -; CHECK-NEXT: vmovq %xmm0, %rax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: double_to_long: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovq %xmm0, %rax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: double_to_long: +; SKX: # BB#0: +; SKX-NEXT: vmovq %xmm0, %rax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast double %x to i64 ret i64 %res } define float @int_to_float(i32 %x) { -; CHECK-LABEL: int_to_float: -; CHECK: # BB#0: -; CHECK-NEXT: vmovd %edi, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: int_to_float: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: int_to_float: +; SKX: # BB#0: +; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast i32 %x to float ret float %res } define i32 @float_to_int(float %x) { -; CHECK-LABEL: float_to_int: -; CHECK: # BB#0: -; CHECK-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: float_to_int: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: float_to_int: +; SKX: # BB#0: +; SKX-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast float %x to i32 ret i32 %res } define <16 x double> @uito16f64(<16 x i32> %a) nounwind { -; CHECK-LABEL: uito16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm2 # sched: [7:1.00] -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm1 # sched: [7:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uito16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm2 +; GENERIC-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm1 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uito16f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm2 # sched: [7:1.00] +; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [3:1.00] +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm1 # sched: [7:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } define <8 x float> @slto8f32(<8 x i64> %a) { -; CHECK-LABEL: slto8f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: slto8f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2ps %zmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: slto8f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } define <16 x float> @slto16f32(<16 x i64> %a) { -; CHECK-LABEL: slto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: vcvtqq2ps %zmm1, %ymm1 # sched: [7:1.00] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: slto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2ps %zmm0, %ymm0 +; GENERIC-NEXT: vcvtqq2ps %zmm1, %ymm1 +; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: slto16f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2ps %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: vcvtqq2ps %zmm1, %ymm1 # sched: [7:1.00] +; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %b } define <8 x double> @slto8f64(<8 x i64> %a) { -; CHECK-LABEL: slto8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: slto8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: slto8f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i64> %a to <8 x double> ret <8 x double> %b } define <16 x double> @slto16f64(<16 x i64> %a) { -; CHECK-LABEL: slto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vcvtqq2pd %zmm1, %zmm1 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: slto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtqq2pd %zmm0, %zmm0 +; GENERIC-NEXT: vcvtqq2pd %zmm1, %zmm1 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: slto16f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtqq2pd %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vcvtqq2pd %zmm1, %zmm1 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i64> %a to <16 x double> ret <16 x double> %b } define <8 x float> @ulto8f32(<8 x i64> %a) { -; CHECK-LABEL: ulto8f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ulto8f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ulto8f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i64> %a to <8 x float> ret <8 x float> %b } define <16 x float> @ulto16f32(<16 x i64> %a) { -; CHECK-LABEL: ulto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: vcvtuqq2ps %zmm1, %ymm1 # sched: [7:1.00] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ulto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtuqq2ps %zmm0, %ymm0 +; GENERIC-NEXT: vcvtuqq2ps %zmm1, %ymm1 +; GENERIC-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ulto16f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtuqq2ps %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: vcvtuqq2ps %zmm1, %ymm1 # sched: [7:1.00] +; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i64> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { -; CHECK-LABEL: uito8f64_mask: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLBW-LABEL: uito8f64_mask: -; VLBW: # BB#0: -; VLBW-NEXT: kmovd %edi, %k1 -; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; VLBW-NEXT: retq +; GENERIC-LABEL: uito8f64_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uito8f64_mask: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] ; VLNOBW-LABEL: uito8f64_mask: ; VLNOBW: # BB#0: ; VLNOBW-NEXT: kmovw %edi, %k1 ; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; VLNOBW-NEXT: retq +; VLNOBW-NEXT: ret{{[l|q]}} %1 = bitcast i8 %c to <8 x i1> %2 = uitofp <8 x i32> %b to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a ret <8 x double> %3 } define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { -; CHECK-LABEL: uito8f64_maskz: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLBW-LABEL: uito8f64_maskz: -; VLBW: # BB#0: -; VLBW-NEXT: kmovd %edi, %k1 -; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; VLBW-NEXT: retq -; VLNOBW-LABEL: uito8f64_maskz: -; VLNOBW: # BB#0: -; VLNOBW-NEXT: kmovw %edi, %k1 -; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; VLNOBW-NEXT: retq +; GENERIC-LABEL: uito8f64_maskz: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uito8f64_maskz: +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = bitcast i8 %b to <8 x i1> %2 = uitofp <8 x i32> %a to <8 x double> %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer @@ -1608,671 +2231,795 @@ define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { } define <4 x double> @uito4f64(<4 x i32> %a) nounwind { -; CHECK-LABEL: uito4f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uito4f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtudq2pd %xmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uito4f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <4 x i32> %a to <4 x double> ret <4 x double> %b } define <16 x float> @uito16f32(<16 x i32> %a) nounwind { -; CHECK-LABEL: uito16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtudq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uito16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtudq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uito16f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtudq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uito8f64(<8 x i32> %a) { -; CHECK-LABEL: uito8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uito8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uito8f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i32> %a to <8 x double> ret <8 x double> %b } define <8 x float> @uito8f32(<8 x i32> %a) nounwind { -; CHECK-LABEL: uito8f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uito8f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtudq2ps %ymm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uito8f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i32> %a to <8 x float> ret <8 x float> %b } define <4 x float> @uito4f32(<4 x i32> %a) nounwind { -; CHECK-LABEL: uito4f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uito4f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtudq2ps %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uito4f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <4 x i32> %a to <4 x float> ret <4 x float> %b } define i32 @fptosi(float %a) nounwind { -; CHECK-LABEL: fptosi: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttss2si %xmm0, %eax # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: fptosi: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttss2si %xmm0, %eax # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: fptosi: +; SKX: # BB#0: +; SKX-NEXT: vcvttss2si %xmm0, %eax # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptosi float %a to i32 ret i32 %b } define i32 @fptoui(float %a) nounwind { -; CHECK-LABEL: fptoui: -; CHECK: # BB#0: -; CHECK-NEXT: vcvttss2usi %xmm0, %eax -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: fptoui: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvttss2usi %xmm0, %eax +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: fptoui: +; SKX: # BB#0: +; SKX-NEXT: vcvttss2usi %xmm0, %eax +; SKX-NEXT: retq # sched: [7:1.00] %b = fptoui float %a to i32 ret i32 %b } define float @uitof32(i32 %a) nounwind { -; CHECK-LABEL: uitof32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uitof32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uitof32: +; SKX: # BB#0: +; SKX-NEXT: vcvtusi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp i32 %a to float ret float %b } define double @uitof64(i32 %a) nounwind { -; CHECK-LABEL: uitof64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uitof64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uitof64: +; SKX: # BB#0: +; SKX-NEXT: vcvtusi2sdl %edi, %xmm0, %xmm0 # sched: [5:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp i32 %a to double ret double %b } define <16 x float> @sbto16f32(<16 x i32> %a) { -; CHECK-LABEL: sbto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %zmm0 -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sbto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %zmm0 +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sbto16f32: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %zmm0 +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = sitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 } define <16 x float> @scto16f32(<16 x i8> %a) { -; CHECK-LABEL: scto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: scto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: scto16f32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %1 = sitofp <16 x i8> %a to <16 x float> ret <16 x float> %1 } define <16 x float> @ssto16f32(<16 x i16> %a) { -; CHECK-LABEL: ssto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ssto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ssto16f32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %1 = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %1 } define <8 x double> @ssto16f64(<8 x i16> %a) { -; CHECK-LABEL: ssto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ssto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ssto16f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %1 } define <8 x double> @scto8f64(<8 x i8> %a) { -; CHECK-LABEL: scto8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] -; CHECK-NEXT: vpslld $24, %ymm0, %ymm0 # sched: [1:0.50] -; CHECK-NEXT: vpsrad $24, %ymm0, %ymm0 # sched: [1:0.50] -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: scto8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] +; GENERIC-NEXT: vpslld $24, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpsrad $24, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: scto8f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] +; SKX-NEXT: vpslld $24, %ymm0, %ymm0 # sched: [1:0.50] +; SKX-NEXT: vpsrad $24, %ymm0, %ymm0 # sched: [1:0.50] +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = sitofp <8 x i8> %a to <8 x double> ret <8 x double> %1 } define <16 x double> @scto16f64(<16 x i8> %a) { -; CHECK-LABEL: scto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbd %xmm0, %zmm1 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: scto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm1 +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 +; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: scto16f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm0, %zmm1 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] +; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i8> %a to <16 x double> ret <16 x double> %b } define <16 x double> @sbto16f64(<16 x double> %a) { -; NOVLDQ-LABEL: sbto16f64: -; NOVLDQ: # BB#0: -; NOVLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; NOVLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 -; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; NOVLDQ-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NOVLDQ-NEXT: vpmovqd %zmm1, %ymm1 -; NOVLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: sbto16f64: -; VLDQ: # BB#0: -; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k0 -; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k1 -; VLDQ-NEXT: vpmovm2d %k1, %ymm0 -; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm1 -; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: sbto16f64: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; VLNODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k2 -; VLNODQ-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm0 {%k2} {z} -; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLNODQ-NEXT: vmovdqa32 %ymm1, %ymm1 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; VLNODQ-NEXT: retq -; -; CHECK-LABEL: sbto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vcmpltpd %zmm1, %zmm2, %k0 # sched: [3:1.00] -; CHECK-NEXT: vcmpltpd %zmm0, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k1, %ymm0 -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: vpmovm2d %k0, %ymm1 -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sbto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: vcmpltpd %zmm1, %zmm2, %k0 +; GENERIC-NEXT: vcmpltpd %zmm0, %zmm2, %k1 +; GENERIC-NEXT: vpmovm2d %k1, %ymm0 +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 +; GENERIC-NEXT: vpmovm2d %k0, %ymm1 +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sbto16f64: +; SKX: # BB#0: +; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vcmpltpd %zmm1, %zmm2, %k0 # sched: [3:1.00] +; SKX-NEXT: vcmpltpd %zmm0, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k1, %ymm0 +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: vpmovm2d %k0, %ymm1 +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <16 x double> %a, zeroinitializer %1 = sitofp <16 x i1> %cmpres to <16 x double> ret <16 x double> %1 } define <8 x double> @sbto8f64(<8 x double> %a) { -; NOVLDQ-LABEL: sbto8f64: -; NOVLDQ: # BB#0: -; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 -; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: sbto8f64: -; VLDQ: # BB#0: -; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm0 -; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: sbto8f64: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLNODQ-NEXT: retq -; -; CHECK-LABEL: sbto8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpltpd %zmm0, %zmm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %ymm0 -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sbto8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %ymm0 +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sbto8f64: +; SKX: # BB#0: +; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %ymm0 +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <8 x double> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x double> ret <8 x double> %1 } define <8 x float> @sbto8f32(<8 x float> %a) { -; NOVLDQ-LABEL: sbto8f32: -; NOVLDQ: # BB#0: -; NOVLDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 -; NOVLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vcmpltps %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vpmovqd %zmm0, %ymm0 -; NOVLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: sbto8f32: -; VLDQ: # BB#0: -; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltps %ymm0, %ymm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm0 -; VLDQ-NEXT: vcvtdq2ps %ymm0, %ymm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: sbto8f32: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltps %ymm0, %ymm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 -; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2ps %ymm0, %ymm0 -; VLNODQ-NEXT: retq -; -; CHECK-LABEL: sbto8f32: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %ymm0 -; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sbto8f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpltps %ymm0, %ymm1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %ymm0 +; GENERIC-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sbto8f32: +; SKX: # BB#0: +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpltps %ymm0, %ymm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %ymm0 +; SKX-NEXT: vcvtdq2ps %ymm0, %ymm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <8 x float> %a, zeroinitializer %1 = sitofp <8 x i1> %cmpres to <8 x float> ret <8 x float> %1 } define <4 x float> @sbto4f32(<4 x float> %a) { -; CHECK-LABEL: sbto4f32: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %xmm0 -; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: sbto4f32: -; VLDQ: # BB#0: -; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; VLDQ-NEXT: retq -; VLNODQ-LABEL: sbto4f32: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; VLNODQ-NEXT: retq +; GENERIC-LABEL: sbto4f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 +; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sbto4f32: +; SKX: # BB#0: +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <4 x float> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x float> ret <4 x float> %1 } define <4 x double> @sbto4f64(<4 x double> %a) { -; CHECK-LABEL: sbto4f64: -; CHECK: # BB#0: -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpltpd %ymm0, %ymm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %xmm0 -; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: sbto4f64: -; VLDQ: # BB#0: -; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VLDQ-NEXT: retq -; VLNODQ-LABEL: sbto4f64: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VLNODQ-NEXT: retq +; GENERIC-LABEL: sbto4f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpltpd %ymm0, %ymm1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 +; GENERIC-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [4:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sbto4f64: +; SKX: # BB#0: +; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpltpd %ymm0, %ymm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2pd %xmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <4 x double> %a, zeroinitializer %1 = sitofp <4 x i1> %cmpres to <4 x double> ret <4 x double> %1 } define <2 x float> @sbto2f32(<2 x float> %a) { -; CHECK-LABEL: sbto2f32: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %xmm0 -; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: sbto2f32: -; VLDQ: # BB#0: -; VLDQ-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltps %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; VLDQ-NEXT: retq -; VLNODQ-LABEL: sbto2f32: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltps %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2ps %xmm0, %xmm0 -; VLNODQ-NEXT: retq +; GENERIC-LABEL: sbto2f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpltps %xmm0, %xmm1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 +; GENERIC-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sbto2f32: +; SKX: # BB#0: +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpltps %xmm0, %xmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %xmm0 +; SKX-NEXT: vcvtdq2ps %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x float> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x float> ret <2 x float> %1 } define <2 x double> @sbto2f64(<2 x double> %a) { -; CHECK-LABEL: sbto2f64: -; CHECK: # BB#0: -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2q %k0, %xmm0 -; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: sbto2f64: -; VLDQ: # BB#0: -; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2q %k0, %xmm0 -; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 -; VLDQ-NEXT: retq -; VLNODQ-LABEL: sbto2f64: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2sdq %rax, %xmm2, %xmm0 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; VLNODQ-NEXT: retq +; GENERIC-LABEL: sbto2f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpltpd %xmm0, %xmm1, %k0 +; GENERIC-NEXT: vpmovm2q %k0, %xmm0 +; GENERIC-NEXT: vcvtqq2pd %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sbto2f64: +; SKX: # BB#0: +; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpltpd %xmm0, %xmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2q %k0, %xmm0 +; SKX-NEXT: vcvtqq2pd %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %cmpres = fcmp ogt <2 x double> %a, zeroinitializer %1 = sitofp <2 x i1> %cmpres to <2 x double> ret <2 x double> %1 } define <16 x float> @ucto16f32(<16 x i8> %a) { -; CHECK-LABEL: ucto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ucto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ucto16f32: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i8> %a to <16 x float> ret <16 x float>%b } define <8 x double> @ucto8f64(<8 x i8> %a) { -; CHECK-LABEL: ucto8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ucto8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ucto8f64: +; SKX: # BB#0: +; SKX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 # sched: [7:0.50] +; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i8> %a to <8 x double> ret <8 x double> %b } define <16 x float> @swto16f32(<16 x i16> %a) { -; CHECK-LABEL: swto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: swto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm0 +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: swto16f32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwd %ymm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <8 x double> @swto8f64(<8 x i16> %a) { -; CHECK-LABEL: swto8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: swto8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: swto8f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwd %xmm0, %ymm0 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <8 x i16> %a to <8 x double> ret <8 x double> %b } define <16 x double> @swto16f64(<16 x i16> %a) { -; CHECK-LABEL: swto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwd %ymm0, %zmm1 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: swto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwd %ymm0, %zmm1 +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 +; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: swto16f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwd %ymm0, %zmm1 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] +; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i16> %a to <16 x double> ret <16 x double> %b } define <16 x double> @ucto16f64(<16 x i8> %a) { -; CHECK-LABEL: ucto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ucto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 +; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ucto16f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] +; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i8> %a to <16 x double> ret <16 x double> %b } define <16 x float> @uwto16f32(<16 x i16> %a) { -; CHECK-LABEL: uwto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uwto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uwto16f32: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <8 x double> @uwto8f64(<8 x i16> %a) { -; CHECK-LABEL: uwto8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uwto8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uwto8f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <8 x i16> %a to <8 x double> ret <8 x double> %b } define <16 x double> @uwto16f64(<16 x i16> %a) { -; CHECK-LABEL: uwto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: uwto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm0 +; GENERIC-NEXT: vextracti64x4 $1, %zmm1, %ymm1 +; GENERIC-NEXT: vcvtdq2pd %ymm1, %zmm1 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: uwto16f64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm0 # sched: [7:1.00] +; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm1, %zmm1 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i16> %a to <16 x double> ret <16 x double> %b } define <16 x float> @sito16f32(<16 x i32> %a) { -; CHECK-LABEL: sito16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sito16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sito16f32: +; SKX: # BB#0: +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i32> %a to <16 x float> ret <16 x float> %b } define <16 x double> @sito16f64(<16 x i32> %a) { -; CHECK-LABEL: sito16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm2 # sched: [7:1.00] -; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm1 # sched: [7:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sito16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm2 +; GENERIC-NEXT: vextractf64x4 $1, %zmm0, %ymm0 +; GENERIC-NEXT: vcvtdq2pd %ymm0, %zmm1 +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sito16f64: +; SKX: # BB#0: +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm2 # sched: [7:1.00] +; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # sched: [3:1.00] +; SKX-NEXT: vcvtdq2pd %ymm0, %zmm1 # sched: [7:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = sitofp <16 x i32> %a to <16 x double> ret <16 x double> %b } define <16 x float> @usto16f32(<16 x i16> %a) { -; CHECK-LABEL: usto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] -; CHECK-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: usto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; GENERIC-NEXT: vcvtdq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: usto16f32: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] +; SKX-NEXT: vcvtdq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b = uitofp <16 x i16> %a to <16 x float> ret <16 x float> %b } define <16 x float> @ubto16f32(<16 x i32> %a) { -; CHECK-LABEL: ubto16f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: vcvtudq2ps %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ubto16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; GENERIC-NEXT: vcvtudq2ps %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ubto16f32: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: vcvtudq2ps %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x float> ret <16 x float> %1 } define <16 x double> @ubto16f64(<16 x i32> %a) { -; CHECK-LABEL: ubto16f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50] -; CHECK-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: kshiftrw $8, %k1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: vcvtudq2pd %ymm1, %zmm1 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ubto16f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 +; GENERIC-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50] +; GENERIC-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} +; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 +; GENERIC-NEXT: kshiftrw $8, %k1, %k1 +; GENERIC-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} +; GENERIC-NEXT: vcvtudq2pd %ymm1, %zmm1 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ubto16f64: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: movl {{.*}}(%rip), %eax # sched: [5:0.50] +; SKX-NEXT: vpbroadcastd %eax, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: kshiftrw $8, %k1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpbroadcastd %eax, %ymm1 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: vcvtudq2pd %ymm1, %zmm1 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <16 x i32> %a, zeroinitializer %1 = uitofp <16 x i1> %mask to <16 x double> ret <16 x double> %1 } define <8 x float> @ubto8f32(<8 x i32> %a) { -; CHECK-LABEL: ubto8f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ubto8f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; GENERIC-NEXT: vcvtudq2ps %ymm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ubto8f32: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: vcvtudq2ps %ymm0, %ymm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x float> ret <8 x float> %1 } define <8 x double> @ubto8f64(<8 x i32> %a) { -; CHECK-LABEL: ubto8f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ubto8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} +; GENERIC-NEXT: vcvtudq2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ubto8f64: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: vcvtudq2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <8 x i32> %a, zeroinitializer %1 = uitofp <8 x i1> %mask to <8 x double> ret <8 x double> %1 } define <4 x float> @ubto4f32(<4 x i32> %a) { -; CHECK-LABEL: ubto4f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] -; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ubto4f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; GENERIC-NEXT: vcvtudq2ps %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ubto4f32: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] +; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x float> ret <4 x float> %1 } define <4 x double> @ubto4f64(<4 x i32> %a) { -; CHECK-LABEL: ubto4f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] -; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ubto4f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; GENERIC-NEXT: vcvtudq2pd %xmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ubto4f64: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpgtd %xmm0, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] +; SKX-NEXT: vcvtudq2pd %xmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp slt <4 x i32> %a, zeroinitializer %1 = uitofp <4 x i1> %mask to <4 x double> ret <4 x double> %1 } define <2 x float> @ubto2f32(<2 x i32> %a) { -; CHECK-LABEL: ubto2f32: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] -; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ubto2f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] +; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; GENERIC-NEXT: vcvtudq2ps %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ubto2f32: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] +; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] +; SKX-NEXT: vcvtudq2ps %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x float> ret <2 x float> %1 } define <2 x double> @ubto2f64(<2 x i32> %a) { -; CHECK-LABEL: ubto2f64: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] -; CHECK-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] -; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; VLDQ-LABEL: ubto2f64: -; VLDQ: # BB#0: -; VLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLDQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} -; VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; VLDQ-NEXT: retq -; VLNODQ-LABEL: ubto2f64: -; VLNODQ: # BB#0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; VLNODQ-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtusi2sdq %rax, %xmm2, %xmm0 -; VLNODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; VLNODQ-NEXT: retq +; GENERIC-LABEL: ubto2f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.50] +; GENERIC-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: vcvtuqq2pd %xmm0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: ubto2f64: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] sched: [1:0.33] +; SKX-NEXT: vpcmpltuq %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] +; SKX-NEXT: vcvtuqq2pd %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ult <2 x i32> %a, zeroinitializer %1 = uitofp <2 x i1> %mask to <2 x double> ret <2 x double> %1 } define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x8mem_to_8x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x8mem_to_8x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x8mem_to_8x16: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbw {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = zext <8 x i8> %a to <8 x i16> %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer @@ -2280,18 +3027,19 @@ define <8 x i16> @zext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re } define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_8x8mem_to_8x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x8mem_to_8x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8x8mem_to_8x16: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbw (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i16> %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> zeroinitializer @@ -2300,18 +3048,19 @@ define <8 x i16> @sext_8x8mem_to_8x16(<8 x i8> *%i , <8 x i1> %mask) nounwind re define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_16x8mem_to_16x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x8mem_to_16x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_16x8mem_to_16x16: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = zext <16 x i8> %a to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer @@ -2319,18 +3068,19 @@ define <16 x i16> @zext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi } define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_16x8mem_to_16x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16x8mem_to_16x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_16x8mem_to_16x16: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbw (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = sext <16 x i8> %a to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer @@ -2338,72 +3088,85 @@ define <16 x i16> @sext_16x8mem_to_16x16(<16 x i8> *%i , <16 x i1> %mask) nounwi } define <16 x i16> @zext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { -; CHECK-LABEL: zext_16x8_to_16x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x8_to_16x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_16x8_to_16x16: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i8> %a to <16 x i16> ret <16 x i16> %x } define <16 x i16> @zext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_16x8_to_16x16_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x8_to_16x16_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm1, %k1 +; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_16x8_to_16x16_mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 -; SKX-NEXT: vpmovb2m %xmm1, %k1 -; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i8> %a to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } define <16 x i16> @sext_16x8_to_16x16(<16 x i8> %a ) nounwind readnone { -; CHECK-LABEL: sext_16x8_to_16x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16x8_to_16x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_16x8_to_16x16: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = sext <16 x i8> %a to <16 x i16> ret <16 x i16> %x } define <16 x i16> @sext_16x8_to_16x16_mask(<16 x i8> %a ,<16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_16x8_to_16x16_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16x8_to_16x16_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm1, %k1 +; GENERIC-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_16x8_to_16x16_mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 -; SKX-NEXT: vpmovb2m %xmm1, %k1 -; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = sext <16 x i8> %a to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %x, <16 x i16> zeroinitializer ret <16 x i16> %ret } define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_32x8mem_to_32x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_32x8mem_to_32x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %ymm0, %k1 +; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_32x8mem_to_32x16: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 -; SKX-NEXT: vpmovb2m %ymm0, %k1 -; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero,mem[16],zero,mem[17],zero,mem[18],zero,mem[19],zero,mem[20],zero,mem[21],zero,mem[22],zero,mem[23],zero,mem[24],zero,mem[25],zero,mem[26],zero,mem[27],zero,mem[28],zero,mem[29],zero,mem[30],zero,mem[31],zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <32 x i8>,<32 x i8> *%i,align 1 %x = zext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer @@ -2411,18 +3174,19 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi } define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_32x8mem_to_32x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_32x8mem_to_32x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %ymm0, %k1 +; GENERIC-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_32x8mem_to_32x16: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 -; SKX-NEXT: vpmovb2m %ymm0, %k1 -; SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %ymm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbw (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <32 x i8>,<32 x i8> *%i,align 1 %x = sext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer @@ -2430,80 +3194,85 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi } define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { -; CHECK-LABEL: zext_32x8_to_32x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_32x8_to_32x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_32x8_to_32x16: ; SKX: # BB#0: -; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; SKX-NEXT: retq +; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <32 x i8> %a to <32 x i16> ret <32 x i16> %x } define <32 x i16> @zext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_32x8_to_32x16_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_32x8_to_32x16_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %ymm1, %k1 +; GENERIC-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_32x8_to_32x16_mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 -; SKX-NEXT: vpmovb2m %ymm1, %k1 -; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbw {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { -; CHECK-LABEL: sext_32x8_to_32x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbw %ymm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_32x8_to_32x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_32x8_to_32x16: ; SKX: # BB#0: -; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = sext <32 x i8> %a to <32 x i16> ret <32 x i16> %x } define <32 x i16> @sext_32x8_to_32x16_mask(<32 x i8> %a ,<32 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_32x8_to_32x16_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_32x8_to_32x16_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %ymm1, %k1 +; GENERIC-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_32x8_to_32x16_mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 -; SKX-NEXT: vpmovb2m %ymm1, %k1 -; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = sext <32 x i8> %a to <32 x i16> %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_4x8mem_to_4x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_4x8mem_to_4x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_4x8mem_to_4x32: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxbd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = zext <4 x i8> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer @@ -2511,18 +3280,19 @@ define <4 x i32> @zext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re } define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_4x8mem_to_4x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x8mem_to_4x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_4x8mem_to_4x32: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovsxbd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = sext <4 x i8> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer @@ -2530,18 +3300,19 @@ define <4 x i32> @sext_4x8mem_to_4x32(<4 x i8> *%i , <4 x i1> %mask) nounwind re } define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x8mem_to_8x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x8mem_to_8x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x8mem_to_8x32: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = zext <8 x i8> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -2549,18 +3320,19 @@ define <8 x i32> @zext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re } define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_8x8mem_to_8x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x8mem_to_8x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8x8mem_to_8x32: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbd (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -2568,18 +3340,19 @@ define <8 x i32> @sext_8x8mem_to_8x32(<8 x i8> *%i , <8 x i1> %mask) nounwind re } define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_16x8mem_to_16x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x8mem_to_16x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_16x8mem_to_16x32: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = zext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer @@ -2587,18 +3360,19 @@ define <16 x i32> @zext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi } define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_16x8mem_to_16x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16x8mem_to_16x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_16x8mem_to_16x32: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbd (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i8>,<16 x i8> *%i,align 1 %x = sext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer @@ -2606,118 +3380,138 @@ define <16 x i32> @sext_16x8mem_to_16x32(<16 x i8> *%i , <16 x i1> %mask) nounwi } define <16 x i32> @zext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_16x8_to_16x32_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x8_to_16x32_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm1, %k1 +; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_16x8_to_16x32_mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 -; SKX-NEXT: vpmovb2m %xmm1, %k1 -; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @sext_16x8_to_16x32_mask(<16 x i8> %a , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_16x8_to_16x32_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16x8_to_16x32_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm1, %k1 +; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_16x8_to_16x32_mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 -; SKX-NEXT: vpmovb2m %xmm1, %k1 -; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = sext <16 x i8> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone { -; CHECK-LABEL: zext_16x8_to_16x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x8_to_16x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_16x8_to_16x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i8> %i to <16 x i32> ret <16 x i32> %x } define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone { -; CHECK-LABEL: sext_16x8_to_16x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16x8_to_16x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbd %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_16x8_to_16x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbd %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = sext <16 x i8> %i to <16 x i32> ret <16 x i32> %x } define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_2x8mem_to_2x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_2x8mem_to_2x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_2x8mem_to_2x64: ; SKX: # BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxbq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = zext <2 x i8> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_2x8mem_to_2x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_2x8mem_to_2x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_2x8mem_to_2x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = sext <2 x i8> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer ret <2 x i64> %ret } define <2 x i64> @sext_2x8mem_to_2x64(<2 x i8> *%i) nounwind readnone { -; CHECK-LABEL: sext_2x8mem_to_2x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbq (%rdi), %xmm0 # sched: [6:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_2x8mem_to_2x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbq (%rdi), %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_2x8mem_to_2x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbq (%rdi), %xmm0 # sched: [6:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i8>,<2 x i8> *%i,align 1 %x = sext <2 x i8> %a to <2 x i64> ret <2 x i64> %x } define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_4x8mem_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_4x8mem_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_4x8mem_to_4x64: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = zext <4 x i8> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer @@ -2725,18 +3519,19 @@ define <4 x i64> @zext_4x8mem_to_4x64(<4 x i8> *%i , <4 x i1> %mask) nounwind re } define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_4x8mem_to_4x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x8mem_to_4x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_4x8mem_to_4x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = sext <4 x i8> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer @@ -2744,28 +3539,34 @@ define <4 x i64> @sext_4x8mem_to_4x64mask(<4 x i8> *%i , <4 x i1> %mask) nounwin } define <4 x i64> @sext_4x8mem_to_4x64(<4 x i8> *%i) nounwind readnone { -; CHECK-LABEL: sext_4x8mem_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbq (%rdi), %ymm0 # sched: [8:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x8mem_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbq (%rdi), %ymm0 # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_4x8mem_to_4x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbq (%rdi), %ymm0 # sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i8>,<4 x i8> *%i,align 1 %x = sext <4 x i8> %a to <4 x i64> ret <4 x i64> %x } define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x8mem_to_8x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x8mem_to_8x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x8mem_to_8x64: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = zext <8 x i8> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer @@ -2773,18 +3574,19 @@ define <8 x i64> @zext_8x8mem_to_8x64(<8 x i8> *%i , <8 x i1> %mask) nounwind re } define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_8x8mem_to_8x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x8mem_to_8x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8x8mem_to_8x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer @@ -2792,28 +3594,34 @@ define <8 x i64> @sext_8x8mem_to_8x64mask(<8 x i8> *%i , <8 x i1> %mask) nounwin } define <8 x i64> @sext_8x8mem_to_8x64(<8 x i8> *%i) nounwind readnone { -; CHECK-LABEL: sext_8x8mem_to_8x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x8mem_to_8x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_8x8mem_to_8x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i8>,<8 x i8> *%i,align 1 %x = sext <8 x i8> %a to <8 x i64> ret <8 x i64> %x } define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_4x16mem_to_4x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_4x16mem_to_4x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_4x16mem_to_4x32: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxwd {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = zext <4 x i16> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer @@ -2821,18 +3629,19 @@ define <4 x i32> @zext_4x16mem_to_4x32(<4 x i16> *%i , <4 x i1> %mask) nounwind } define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_4x16mem_to_4x32mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x16mem_to_4x32mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_4x16mem_to_4x32mask: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i32> %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> zeroinitializer @@ -2840,10 +3649,15 @@ define <4 x i32> @sext_4x16mem_to_4x32mask(<4 x i16> *%i , <4 x i1> %mask) nounw } define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone { -; CHECK-LABEL: sext_4x16mem_to_4x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwd (%rdi), %xmm0 # sched: [6:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x16mem_to_4x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwd (%rdi), %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_4x16mem_to_4x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwd (%rdi), %xmm0 # sched: [6:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i32> ret <4 x i32> %x @@ -2851,18 +3665,19 @@ define <4 x i32> @sext_4x16mem_to_4x32(<4 x i16> *%i) nounwind readnone { define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x16mem_to_8x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x16mem_to_8x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x16mem_to_8x32: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = zext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -2870,18 +3685,19 @@ define <8 x i32> @zext_8x16mem_to_8x32(<8 x i16> *%i , <8 x i1> %mask) nounwind } define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_8x16mem_to_8x32mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x16mem_to_8x32mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8x16mem_to_8x32mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer @@ -2889,55 +3705,67 @@ define <8 x i32> @sext_8x16mem_to_8x32mask(<8 x i16> *%i , <8 x i1> %mask) nounw } define <8 x i32> @sext_8x16mem_to_8x32(<8 x i16> *%i) nounwind readnone { -; CHECK-LABEL: sext_8x16mem_to_8x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwd (%rdi), %ymm0 # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x16mem_to_8x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwd (%rdi), %ymm0 # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_8x16mem_to_8x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwd (%rdi), %ymm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i32> ret <8 x i32> %x } define <8 x i32> @zext_8x16_to_8x32mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x16_to_8x32mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x16_to_8x32mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm1, %k1 +; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x16_to_8x32mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 -; SKX-NEXT: vpmovw2m %xmm1, %k1 -; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <8 x i16> %a to <8 x i32> %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> zeroinitializer ret <8 x i32> %ret } define <8 x i32> @zext_8x16_to_8x32(<8 x i16> %a ) nounwind readnone { -; CHECK-LABEL: zext_8x16_to_8x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x16_to_8x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_8x16_to_8x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <8 x i16> %a to <8 x i32> ret <8 x i32> %x } define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_16x16mem_to_16x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x16mem_to_16x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_16x16mem_to_16x32: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = zext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer @@ -2945,18 +3773,19 @@ define <16 x i32> @zext_16x16mem_to_16x32(<16 x i16> *%i , <16 x i1> %mask) noun } define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_16x16mem_to_16x32mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16x16mem_to_16x32mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_16x16mem_to_16x32mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k1 -; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = sext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer @@ -2964,54 +3793,66 @@ define <16 x i32> @sext_16x16mem_to_16x32mask(<16 x i16> *%i , <16 x i1> %mask) } define <16 x i32> @sext_16x16mem_to_16x32(<16 x i16> *%i) nounwind readnone { -; CHECK-LABEL: sext_16x16mem_to_16x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16x16mem_to_16x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwd (%rdi), %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_16x16mem_to_16x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwd (%rdi), %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <16 x i16>,<16 x i16> *%i,align 1 %x = sext <16 x i16> %a to <16 x i32> ret <16 x i32> %x } define <16 x i32> @zext_16x16_to_16x32mask(<16 x i16> %a , <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_16x16_to_16x32mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x16_to_16x32mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm1, %k1 +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_16x16_to_16x32mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 -; SKX-NEXT: vpmovb2m %xmm1, %k1 -; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i16> %a to <16 x i32> %ret = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer ret <16 x i32> %ret } define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %a ) nounwind readnone { -; CHECK-LABEL: zext_16x16_to_16x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x16_to_16x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_16x16_to_16x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <16 x i16> %a to <16 x i32> ret <16 x i32> %x } define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_2x16mem_to_2x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_2x16mem_to_2x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_2x16mem_to_2x64: ; SKX: # BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxwq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = zext <2 x i16> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer @@ -3019,18 +3860,19 @@ define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind } define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_2x16mem_to_2x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_2x16mem_to_2x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_2x16mem_to_2x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = sext <2 x i16> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer @@ -3038,28 +3880,34 @@ define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounw } define <2 x i64> @sext_2x16mem_to_2x64(<2 x i16> *%i) nounwind readnone { -; CHECK-LABEL: sext_2x16mem_to_2x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwq (%rdi), %xmm0 # sched: [6:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_2x16mem_to_2x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwq (%rdi), %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_2x16mem_to_2x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwq (%rdi), %xmm0 # sched: [6:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i16>,<2 x i16> *%i,align 1 %x = sext <2 x i16> %a to <2 x i64> ret <2 x i64> %x } define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_4x16mem_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_4x16mem_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_4x16mem_to_4x64: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = zext <4 x i16> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer @@ -3067,18 +3915,19 @@ define <4 x i64> @zext_4x16mem_to_4x64(<4 x i16> *%i , <4 x i1> %mask) nounwind } define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_4x16mem_to_4x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x16mem_to_4x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_4x16mem_to_4x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer @@ -3086,28 +3935,34 @@ define <4 x i64> @sext_4x16mem_to_4x64mask(<4 x i16> *%i , <4 x i1> %mask) nounw } define <4 x i64> @sext_4x16mem_to_4x64(<4 x i16> *%i) nounwind readnone { -; CHECK-LABEL: sext_4x16mem_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwq (%rdi), %ymm0 # sched: [8:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x16mem_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwq (%rdi), %ymm0 # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_4x16mem_to_4x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwq (%rdi), %ymm0 # sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i16>,<4 x i16> *%i,align 1 %x = sext <4 x i16> %a to <4 x i64> ret <4 x i64> %x } define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x16mem_to_8x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x16mem_to_8x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x16mem_to_8x64: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = zext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer @@ -3115,18 +3970,19 @@ define <8 x i64> @zext_8x16mem_to_8x64(<8 x i16> *%i , <8 x i1> %mask) nounwind } define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_8x16mem_to_8x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x16mem_to_8x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8x16mem_to_8x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer @@ -3134,55 +3990,67 @@ define <8 x i64> @sext_8x16mem_to_8x64mask(<8 x i16> *%i , <8 x i1> %mask) nounw } define <8 x i64> @sext_8x16mem_to_8x64(<8 x i16> *%i) nounwind readnone { -; CHECK-LABEL: sext_8x16mem_to_8x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x16mem_to_8x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxwq (%rdi), %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_8x16mem_to_8x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxwq (%rdi), %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i16>,<8 x i16> *%i,align 1 %x = sext <8 x i16> %a to <8 x i64> ret <8 x i64> %x } define <8 x i64> @zext_8x16_to_8x64mask(<8 x i16> %a , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x16_to_8x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x16_to_8x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm1, %k1 +; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x16_to_8x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 -; SKX-NEXT: vpmovw2m %xmm1, %k1 -; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <8 x i16> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %a) nounwind readnone { -; CHECK-LABEL: zext_8x16_to_8x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x16_to_8x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_8x16_to_8x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %ret = zext <8 x i16> %a to <8 x i64> ret <8 x i64> %ret } define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_2x32mem_to_2x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_2x32mem_to_2x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_2x32mem_to_2x64: ; SKX: # BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxdq {{.*#+}} xmm0 {%k1} {z} = mem[0],zero,mem[1],zero sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = zext <2 x i32> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer @@ -3190,18 +4058,19 @@ define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind } define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_2x32mem_to_2x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_2x32mem_to_2x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_2x32mem_to_2x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 {%k1} {z} # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = sext <2 x i32> %a to <2 x i64> %ret = select <2 x i1> %mask, <2 x i64> %x, <2 x i64> zeroinitializer @@ -3209,28 +4078,34 @@ define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounw } define <2 x i64> @sext_2x32mem_to_2x64(<2 x i32> *%i) nounwind readnone { -; CHECK-LABEL: sext_2x32mem_to_2x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxdq (%rdi), %xmm0 # sched: [6:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_2x32mem_to_2x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxdq (%rdi), %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_2x32mem_to_2x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxdq (%rdi), %xmm0 # sched: [6:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <2 x i32>,<2 x i32> *%i,align 1 %x = sext <2 x i32> %a to <2 x i64> ret <2 x i64> %x } define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_4x32mem_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_4x32mem_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_4x32mem_to_4x64: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = zext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer @@ -3238,18 +4113,19 @@ define <4 x i64> @zext_4x32mem_to_4x64(<4 x i32> *%i , <4 x i1> %mask) nounwind } define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_4x32mem_to_4x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x32mem_to_4x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_4x32mem_to_4x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = sext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer @@ -3257,55 +4133,67 @@ define <4 x i64> @sext_4x32mem_to_4x64mask(<4 x i32> *%i , <4 x i1> %mask) nounw } define <4 x i64> @sext_4x32mem_to_4x64(<4 x i32> *%i) nounwind readnone { -; CHECK-LABEL: sext_4x32mem_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxdq (%rdi), %ymm0 # sched: [9:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x32mem_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxdq (%rdi), %ymm0 # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_4x32mem_to_4x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxdq (%rdi), %ymm0 # sched: [9:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <4 x i32>,<4 x i32> *%i,align 1 %x = sext <4 x i32> %a to <4 x i64> ret <4 x i64> %x } define <4 x i64> @sext_4x32_to_4x64(<4 x i32> %a) nounwind readnone { -; CHECK-LABEL: sext_4x32_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_4x32_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_4x32_to_4x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxdq %xmm0, %ymm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = sext <4 x i32> %a to <4 x i64> ret <4 x i64> %x } define <4 x i64> @zext_4x32_to_4x64mask(<4 x i32> %a , <4 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_4x32_to_4x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_4x32_to_4x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm1, %xmm1, %k1 +; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_4x32_to_4x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm1, %xmm1 -; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 -; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SKX-NEXT: retq +; SKX-NEXT: vpslld $31, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm1, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <4 x i32> %a to <4 x i64> %ret = select <4 x i1> %mask, <4 x i64> %x, <4 x i64> zeroinitializer ret <4 x i64> %ret } define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x32mem_to_8x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x32mem_to_8x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x32mem_to_8x64: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = zext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer @@ -3313,18 +4201,19 @@ define <8 x i64> @zext_8x32mem_to_8x64(<8 x i32> *%i , <8 x i1> %mask) nounwind } define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: sext_8x32mem_to_8x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x32mem_to_8x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k1 +; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8x32mem_to_8x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k1 -; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = sext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer @@ -3332,150 +4221,176 @@ define <8 x i64> @sext_8x32mem_to_8x64mask(<8 x i32> *%i , <8 x i1> %mask) nounw } define <8 x i64> @sext_8x32mem_to_8x64(<8 x i32> *%i) nounwind readnone { -; CHECK-LABEL: sext_8x32mem_to_8x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [10:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x32mem_to_8x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxdq (%rdi), %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_8x32mem_to_8x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxdq (%rdi), %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %a = load <8 x i32>,<8 x i32> *%i,align 1 %x = sext <8 x i32> %a to <8 x i64> ret <8 x i64> %x } define <8 x i64> @sext_8x32_to_8x64(<8 x i32> %a) nounwind readnone { -; CHECK-LABEL: sext_8x32_to_8x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8x32_to_8x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxdq %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: sext_8x32_to_8x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovsxdq %ymm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = sext <8 x i32> %a to <8 x i64> ret <8 x i64> %x } define <8 x i64> @zext_8x32_to_8x64mask(<8 x i32> %a , <8 x i1> %mask) nounwind readnone { -; CHECK-LABEL: zext_8x32_to_8x64mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x32_to_8x64mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm1, %k1 +; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8x32_to_8x64mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 -; SKX-NEXT: vpmovw2m %xmm1, %k1 -; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $15, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxdq {{.*#+}} zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = zext <8 x i32> %a to <8 x i64> %ret = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> zeroinitializer ret <8 x i64> %ret } define <8 x float> @fptrunc_test(<8 x double> %a) nounwind readnone { -; CHECK-LABEL: fptrunc_test: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: fptrunc_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtpd2ps %zmm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: fptrunc_test: +; SKX: # BB#0: +; SKX-NEXT: vcvtpd2ps %zmm0, %ymm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = fptrunc <8 x double> %a to <8 x float> ret <8 x float> %b } define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone { -; CHECK-LABEL: fpext_test: -; CHECK: # BB#0: -; CHECK-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [7:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: fpext_test: +; GENERIC: # BB#0: +; GENERIC-NEXT: vcvtps2pd %ymm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: fpext_test: +; SKX: # BB#0: +; SKX-NEXT: vcvtps2pd %ymm0, %zmm0 # sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = fpext <8 x float> %a to <8 x double> ret <8 x double> %b } define <16 x i32> @zext_16i1_to_16xi32(i16 %b) { -; CHECK-LABEL: zext_16i1_to_16xi32: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16i1_to_16xi32: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_16i1_to_16xi32: ; SKX: # BB#0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i16 %b to <16 x i1> %c = zext <16 x i1> %a to <16 x i32> ret <16 x i32> %c } define <8 x i64> @zext_8i1_to_8xi64(i8 %b) { -; CHECK-LABEL: zext_8i1_to_8xi64: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8i1_to_8xi64: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_8i1_to_8xi64: ; SKX: # BB#0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i8 %b to <8 x i1> %c = zext <8 x i1> %a to <8 x i64> ret <8 x i64> %c } define i16 @trunc_16i8_to_16i1(<16 x i8> %a) { -; CHECK-LABEL: trunc_16i8_to_16i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: trunc_16i8_to_16i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: trunc_16i8_to_16i1: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %mask_b = trunc <16 x i8>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask } define i16 @trunc_16i32_to_16i1(<16 x i32> %a) { -; CHECK-LABEL: trunc_16i32_to_16i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: trunc_16i32_to_16i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %zmm0, %zmm0 +; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: trunc_16i32_to_16i1: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %zmm0, %zmm0 -; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: vpslld $31, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %zmm0, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: %AX %AX %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %mask_b = trunc <16 x i32>%a to <16 x i1> %mask = bitcast <16 x i1> %mask_b to i16 ret i16 %mask } define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { -; CHECK-LABEL: trunc_4i32_to_4i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: trunc_4i32_to_4i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k1 +; GENERIC-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: trunc_4i32_to_4i1: ; SKX: # BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 -; SKX-NEXT: vpslld $31, %xmm1, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 {%k1} # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %mask_a = trunc <4 x i32>%a to <4 x i1> %mask_b = trunc <4 x i32>%b to <4 x i1> %a_and_b = and <4 x i1>%mask_a, %mask_b @@ -3485,36 +4400,38 @@ define <4 x i32> @trunc_4i32_to_4i1(<4 x i32> %a, <4 x i32> %b) { define i8 @trunc_8i16_to_8i1(<8 x i16> %a) { -; CHECK-LABEL: trunc_8i16_to_8i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AL %AL %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: trunc_8i16_to_8i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AL %AL %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: trunc_8i16_to_8i1: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %mask_b = trunc <8 x i16>%a to <8 x i1> %mask = bitcast <8 x i1> %mask_b to i8 ret i8 %mask } define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { -; CHECK-LABEL: sext_8i1_8i32: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8i1_8i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8i1_8i32: ; SKX: # BB#0: -; SKX-NEXT: vpcmpled %ymm0, %ymm1, %k0 +; SKX-NEXT: vpcmpled %ymm0, %ymm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %ymm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %x = icmp slt <8 x i32> %a1, %a2 %x1 = xor <8 x i1>%x, %y = sext <8 x i1> %x1 to <8 x i32> @@ -3523,30 +4440,31 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind { define i16 @trunc_i32_to_i1(i32 %a) { -; CHECK-LABEL: trunc_i32_to_i1: -; CHECK: # BB#0: -; CHECK-NEXT: movw $-4, %ax # sched: [1:0.25] -; CHECK-NEXT: kmovd %eax, %k0 # sched: [1:1.00] -; CHECK-NEXT: kshiftrw $1, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftlw $1, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: andl $1, %edi # sched: [1:0.25] -; CHECK-NEXT: kmovw %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: trunc_i32_to_i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: movw $-4, %ax # sched: [1:0.33] +; GENERIC-NEXT: kmovd %eax, %k0 +; GENERIC-NEXT: kshiftrw $1, %k0, %k0 +; GENERIC-NEXT: kshiftlw $1, %k0, %k0 +; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33] +; GENERIC-NEXT: kmovw %edi, %k1 +; GENERIC-NEXT: korw %k1, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: trunc_i32_to_i1: ; SKX: # BB#0: -; SKX-NEXT: movw $-4, %ax -; SKX-NEXT: kmovd %eax, %k0 -; SKX-NEXT: kshiftrw $1, %k0, %k0 -; SKX-NEXT: kshiftlw $1, %k0, %k0 -; SKX-NEXT: andl $1, %edi -; SKX-NEXT: kmovw %edi, %k1 -; SKX-NEXT: korw %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax +; SKX-NEXT: movw $-4, %ax # sched: [1:0.25] +; SKX-NEXT: kmovd %eax, %k0 # sched: [1:1.00] +; SKX-NEXT: kshiftrw $1, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftlw $1, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: andl $1, %edi # sched: [1:0.25] +; SKX-NEXT: kmovw %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: korw %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] ; SKX-NEXT: # kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %a_i = trunc i32 %a to i1 %maskv = insertelement <16 x i1> , i1 %a_i, i32 0 %res = bitcast <16 x i1> %maskv to i16 @@ -3554,68 +4472,72 @@ define i16 @trunc_i32_to_i1(i32 %a) { } define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind { -; CHECK-LABEL: sext_8i1_8i16: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2w %k0, %xmm0 -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8i1_8i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 +; GENERIC-NEXT: vpmovm2w %k0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8i1_8i16: ; SKX: # BB#0: -; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2w %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = icmp slt <8 x i32> %a1, %a2 %y = sext <8 x i1> %x to <8 x i16> ret <8 x i16> %y } define <16 x i32> @sext_16i1_16i32(<16 x i32> %a1, <16 x i32> %a2) nounwind { -; CHECK-LABEL: sext_16i1_16i32: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_16i1_16i32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_16i1_16i32: ; SKX: # BB#0: -; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 +; SKX-NEXT: vpcmpgtd %zmm0, %zmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %x = icmp slt <16 x i32> %a1, %a2 %y = sext <16 x i1> %x to <16 x i32> ret <16 x i32> %y } define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind { -; CHECK-LABEL: sext_8i1_8i64: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2q %k0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: sext_8i1_8i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 +; GENERIC-NEXT: vpmovm2q %k0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: sext_8i1_8i64: ; SKX: # BB#0: -; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 +; SKX-NEXT: vpcmpgtd %ymm0, %ymm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2q %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %x = icmp slt <8 x i32> %a1, %a2 %y = sext <8 x i1> %x to <8 x i64> ret <8 x i64> %y } define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { -; CHECK-LABEL: extload_v8i64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: extload_v8i64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovsxbq (%rdi), %zmm0 +; GENERIC-NEXT: vmovdqa64 %zmm0, (%rsi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: extload_v8i64: ; SKX: # BB#0: -; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 -; SKX-NEXT: vmovdqa64 %zmm0, (%rsi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vpmovsxbq (%rdi), %zmm0 # sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm0, (%rsi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %sign_load = load <8 x i8>, <8 x i8>* %a %c = sext <8 x i8> %sign_load to <8 x i64> store <8 x i64> %c, <8 x i64>* %res @@ -3623,49 +4545,56 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { } define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { -; CHECK-LABEL: test21: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:1.00] -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: kshiftrq $32, %k1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test21: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %zmm2, %zmm2 +; GENERIC-NEXT: vpmovb2m %zmm2, %k1 +; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: kshiftrq $32, %k1, %k1 +; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test21: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %zmm2, %zmm2 -; SKX-NEXT: vpmovb2m %zmm2, %k1 +; SKX-NEXT: vpsllw $7, %zmm2, %zmm2 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %zmm2, %k1 # sched: [1:1.00] ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: kshiftrq $32, %k1, %k1 +; SKX-NEXT: kshiftrq $32, %k1, %k1 # sched: [3:1.00] ; SKX-NEXT: vmovdqu16 %zmm1, %zmm1 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer ret <64 x i16> %ret } define <16 x i16> @shuffle_zext_16x8_to_16x16(<16 x i8> %a) nounwind readnone { -; CHECK-LABEL: shuffle_zext_16x8_to_16x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: shuffle_zext_16x8_to_16x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: shuffle_zext_16x8_to_16x16: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> %2 = bitcast <32 x i8> %1 to <16 x i16> ret <16 x i16> %2 } define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask) nounwind readnone { -; CHECK-LABEL: shuffle_zext_16x8_to_16x16_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: shuffle_zext_16x8_to_16x16_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm1, %k1 +; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: shuffle_zext_16x8_to_16x16_mask: ; SKX: # BB#0: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 -; SKX-NEXT: vpmovb2m %xmm1, %k1 -; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; SKX-NEXT: retq +; SKX-NEXT: vpsllw $7, %xmm1, %xmm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm1, %k1 # sched: [1:1.00] +; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <32 x i32> %bc = bitcast <32 x i8> %x to <16 x i16> %ret = select <16 x i1> %mask, <16 x i16> %bc, <16 x i16> zeroinitializer @@ -3673,108 +4602,141 @@ define <16 x i16> @shuffle_zext_16x8_to_16x16_mask(<16 x i8> %a, <16 x i1> %mask } define <16 x i16> @zext_32x8_to_16x16(<32 x i8> %a) { -; CHECK-LABEL: zext_32x8_to_16x16: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_32x8_to_16x16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_32x8_to_16x16: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> %2 = bitcast <32 x i8> %1 to <16 x i16> ret <16 x i16> %2 } define <8 x i32> @zext_32x8_to_8x32(<32 x i8> %a) { -; CHECK-LABEL: zext_32x8_to_8x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_32x8_to_8x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_32x8_to_8x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> %2 = bitcast <32 x i8> %1 to <8 x i32> ret <8 x i32> %2 } define <4 x i64> @zext_32x8_to_4x64(<32 x i8> %a) { -; CHECK-LABEL: zext_32x8_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_32x8_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_32x8_to_4x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> %2 = bitcast <32 x i8> %1 to <4 x i64> ret <4 x i64> %2 } define <8 x i32> @zext_16x16_to_8x32(<16 x i16> %a) { -; CHECK-LABEL: zext_16x16_to_8x32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x16_to_8x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_16x16_to_8x32: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> %2 = bitcast <16 x i16> %1 to <8 x i32> ret <8 x i32> %2 } define <4 x i64> @zext_16x16_to_4x64(<16 x i16> %a) { -; CHECK-LABEL: zext_16x16_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16x16_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_16x16_to_4x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <16 x i16> %a, <16 x i16> zeroinitializer, <16 x i32> %2 = bitcast <16 x i16> %1 to <4 x i64> ret <4 x i64> %2 } define <4 x i64> @zext_8x32_to_4x64(<8 x i32> %a) { -; CHECK-LABEL: zext_8x32_to_4x64: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_8x32_to_4x64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: zext_8x32_to_4x64: +; SKX: # BB#0: +; SKX-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %1 = shufflevector <8 x i32> %a, <8 x i32> zeroinitializer, <8 x i32> %2 = bitcast <8 x i32> %1 to <4 x i64> ret <4 x i64> %2 } define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 { -; CHECK-LABEL: zext_64xi1_to_64xi8: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_64xi1_to_64xi8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_64xi1_to_64xi8: ; SKX: # BB#0: -; SKX-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 -; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <64 x i8> %x, %y %1 = zext <64 x i1> %mask to <64 x i8> ret <64 x i8> %1 } define <32 x i16> @zext_32xi1_to_32xi16(<32 x i16> %x, <32 x i16> %y) #0 { -; CHECK-LABEL: zext_32xi1_to_32xi16: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_32xi1_to_32xi16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_32xi1_to_32xi16: ; SKX: # BB#0: -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <32 x i16> %x, %y %1 = zext <32 x i1> %mask to <32 x i16> ret <32 x i16> %1 } define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 { -; CHECK-LABEL: zext_16xi1_to_16xi16: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_16xi1_to_16xi16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_16xi1_to_16xi16: ; SKX: # BB#0: -; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 -; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu16 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <16 x i16> %x, %y %1 = zext <16 x i1> %mask to <16 x i16> ret <16 x i16> %1 @@ -3782,105 +4744,133 @@ define <16 x i16> @zext_16xi1_to_16xi16(<16 x i16> %x, <16 x i16> %y) #0 { define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { -; CHECK-LABEL: zext_32xi1_to_32xi8: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_32xi1_to_32xi8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_32xi1_to_32xi8: ; SKX: # BB#0: -; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 -; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <32 x i16> %x, %y %1 = zext <32 x i1> %mask to <32 x i8> ret <32 x i8> %1 } define <4 x i32> @zext_4xi1_to_4x32(<4 x i8> %x, <4 x i8> %y) #0 { -; CHECK-LABEL: zext_4xi1_to_4x32: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50] -; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_4xi1_to_4x32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50] +; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_4xi1_to_4x32: ; SKX: # BB#0: -; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 -; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] sched: [6:0.50] +; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <4 x i8> %x, %y %1 = zext <4 x i1> %mask to <4 x i32> ret <4 x i32> %1 } define <2 x i64> @zext_2xi1_to_2xi64(<2 x i8> %x, <2 x i8> %y) #0 { -; CHECK-LABEL: zext_2xi1_to_2xi64: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50] -; CHECK-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_2xi1_to_2xi64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50] +; GENERIC-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_2xi1_to_2xi64: ; SKX: # BB#0: -; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 -; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] sched: [6:0.50] +; SKX-NEXT: vpand %xmm2, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpand %xmm2, %xmm0, %xmm0 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 {{.*}}(%rip), %xmm0 {%k1} {z} # sched: [7:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp eq <2 x i8> %x, %y %1 = zext <2 x i1> %mask to <2 x i64> ret <2 x i64> %1 } define <16 x float> @test_x86_fmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { -; CHECK-LABEL: test_x86_fmadd_ps_z: -; CHECK: # BB#0: -; CHECK-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmadd_ps_z: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: vaddps %zmm2, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fmadd_ps_z: +; SKX: # BB#0: +; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vaddps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <16 x float> %a0, %a1 %res = fadd <16 x float> %x, %a2 ret <16 x float> %res } define <16 x float> @test_x86_fmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { -; CHECK-LABEL: test_x86_fmsub_ps_z: -; CHECK: # BB#0: -; CHECK-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmsub_ps_z: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: vsubps %zmm2, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fmsub_ps_z: +; SKX: # BB#0: +; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <16 x float> %a0, %a1 %res = fsub <16 x float> %x, %a2 ret <16 x float> %res } define <16 x float> @test_x86_fnmadd_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { -; CHECK-LABEL: test_x86_fnmadd_ps_z: -; CHECK: # BB#0: -; CHECK-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vsubps %zmm0, %zmm2, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fnmadd_ps_z: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: vsubps %zmm0, %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fnmadd_ps_z: +; SKX: # BB#0: +; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vsubps %zmm0, %zmm2, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <16 x float> %a0, %a1 %res = fsub <16 x float> %a2, %x ret <16 x float> %res } define <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) { -; CHECK-LABEL: test_x86_fnmsub_ps_z: -; CHECK: # BB#0: -; CHECK-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fnmsub_ps_z: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: vsubps %zmm2, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fnmsub_ps_z: +; SKX: # BB#0: +; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vsubps %zmm2, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <16 x float> %a0, %a1 %y = fsub <16 x float> @test_x86_fnmsub_ps_z(<16 x float> %a0, <16 x float> %a1, <1 } define <8 x double> @test_x86_fmadd_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { -; CHECK-LABEL: test_x86_fmadd_pd_z: -; CHECK: # BB#0: -; CHECK-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmadd_pd_z: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulpd %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: vaddpd %zmm2, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fmadd_pd_z: +; SKX: # BB#0: +; SKX-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vaddpd %zmm2, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <8 x double> %a0, %a1 %res = fadd <8 x double> %x, %a2 ret <8 x double> %res } define <8 x double> @test_x86_fmsub_pd_z(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) { -; CHECK-LABEL: test_x86_fmsub_pd_z: -; CHECK: # BB#0: -; CHECK-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vsubpd %zmm2, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmsub_pd_z: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulpd %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: vsubpd %zmm2, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fmsub_pd_z: +; SKX: # BB#0: +; SKX-NEXT: vmulpd %zmm1, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vsubpd %zmm2, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = fmul <8 x double> %a0, %a1 %res = fsub <8 x double> %x, %a2 ret <8 x double> %res } define double @test_x86_fmsub_213(double %a0, double %a1, double %a2) { -; CHECK-LABEL: test_x86_fmsub_213: -; CHECK: # BB#0: -; CHECK-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: vsubsd %xmm2, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmsub_213: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; GENERIC-NEXT: vsubsd %xmm2, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fmsub_213: +; SKX: # BB#0: +; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vsubsd %xmm2, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %x = fmul double %a0, %a1 %res = fsub double %x, %a2 ret double %res } define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) { -; CHECK-LABEL: test_x86_fmsub_213_m: -; CHECK: # BB#0: -; CHECK-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmsub_213_m: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [5:1.00] +; GENERIC-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fmsub_213_m: +; SKX: # BB#0: +; SKX-NEXT: vmulsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vsubsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a2 = load double , double *%a2_ptr %x = fmul double %a0, %a1 %res = fsub double %x, %a2 @@ -3936,11 +4950,17 @@ define double @test_x86_fmsub_213_m(double %a0, double %a1, double * %a2_ptr) { } define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) { -; CHECK-LABEL: test_x86_fmsub_231_m: -; CHECK: # BB#0: -; CHECK-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] -; CHECK-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmsub_231_m: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [11:1.00] +; GENERIC-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_x86_fmsub_231_m: +; SKX: # BB#0: +; SKX-NEXT: vmulsd (%rdi), %xmm0, %xmm0 # sched: [9:0.50] +; SKX-NEXT: vsubsd %xmm1, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a2 = load double , double *%a2_ptr %x = fmul double %a0, %a2 %res = fsub double %x, %a1 @@ -3948,22 +4968,34 @@ define double @test_x86_fmsub_231_m(double %a0, double %a1, double * %a2_ptr) { } define <16 x float> @test231_br(<16 x float> %a1, <16 x float> %a2) nounwind { -; CHECK-LABEL: test231_br: -; CHECK: # BB#0: -; CHECK-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test231_br: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test231_br: +; SKX: # BB#0: +; SKX-NEXT: vmulps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b1 = fmul <16 x float> %a1, %b2 = fadd <16 x float> %b1, %a2 ret <16 x float> %b2 } define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind { -; CHECK-LABEL: test213_br: -; CHECK: # BB#0: -; CHECK-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test213_br: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmulps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test213_br: +; SKX: # BB#0: +; SKX-NEXT: vmulps %zmm1, %zmm0, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vaddps {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %b1 = fmul <16 x float> %a1, %a2 %b2 = fadd <16 x float> %b1, ret <16 x float> %b2 @@ -3971,19 +5003,21 @@ define <16 x float> @test213_br(<16 x float> %a1, <16 x float> %a2) nounwind { ;mask (a*c+b , a) define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { -; CHECK-LABEL: test_x86_fmadd132_ps: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] -; CHECK-NEXT: vmulps (%rdi), %zmm0, %zmm2 # sched: [11:0.50] -; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm0 {%k1} # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmadd132_ps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm2, %k1 +; GENERIC-NEXT: vmulps (%rdi), %zmm0, %zmm2 +; GENERIC-NEXT: vaddps %zmm1, %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_x86_fmadd132_ps: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 -; SKX-NEXT: vpmovb2m %xmm2, %k1 -; SKX-NEXT: vfmadd132ps (%rdi), %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] +; SKX-NEXT: vmulps (%rdi), %zmm0, %zmm2 # sched: [11:0.50] +; SKX-NEXT: vaddps %zmm1, %zmm2, %zmm0 {%k1} # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1 %x = fmul <16 x float> %a0, %a2 %y = fadd <16 x float> %x, %a1 @@ -3993,21 +5027,23 @@ define <16 x float> @test_x86_fmadd132_ps(<16 x float> %a0, <16 x float> %a1, <1 ;mask (a*c+b , b) define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { -; CHECK-LABEL: test_x86_fmadd231_ps: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] -; CHECK-NEXT: vmulps (%rdi), %zmm0, %zmm0 # sched: [11:0.50] -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [4:0.33] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmadd231_ps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm2, %k1 +; GENERIC-NEXT: vmulps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: vaddps %zmm1, %zmm0, %zmm1 {%k1} +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_x86_fmadd231_ps: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 -; SKX-NEXT: vpmovb2m %xmm2, %k1 -; SKX-NEXT: vfmadd231ps (%rdi), %zmm0, %zmm1 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] +; SKX-NEXT: vmulps (%rdi), %zmm0, %zmm0 # sched: [11:0.50] +; SKX-NEXT: vaddps %zmm1, %zmm0, %zmm1 {%k1} # sched: [4:0.33] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1 %x = fmul <16 x float> %a0, %a2 %y = fadd <16 x float> %x, %a1 @@ -4017,21 +5053,23 @@ define <16 x float> @test_x86_fmadd231_ps(<16 x float> %a0, <16 x float> %a1, <1 ;mask (b*a+c , b) define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <16 x float> *%a2_ptrt, <16 x i1> %mask) { -; CHECK-LABEL: test_x86_fmadd213_ps: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] -; CHECK-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [11:0.50] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_x86_fmadd213_ps: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm2, %k1 +; GENERIC-NEXT: vmulps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: vaddps (%rdi), %zmm0, %zmm1 {%k1} +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_x86_fmadd213_ps: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 -; SKX-NEXT: vpmovb2m %xmm2, %k1 -; SKX-NEXT: vfmadd213ps (%rdi), %zmm0, %zmm1 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $7, %xmm2, %xmm2 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm2, %k1 # sched: [1:1.00] +; SKX-NEXT: vmulps %zmm0, %zmm1, %zmm0 # sched: [4:0.33] +; SKX-NEXT: vaddps (%rdi), %zmm0, %zmm1 {%k1} # sched: [11:0.50] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a2 = load <16 x float>,<16 x float> *%a2_ptrt,align 1 %x = fmul <16 x float> %a1, %a0 %y = fadd <16 x float> %x, %a2 @@ -4040,11 +5078,17 @@ define <16 x float> @test_x86_fmadd213_ps(<16 x float> %a0, <16 x float> %a1, <1 } define <16 x i32> @vpandd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpandd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpandd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpandd: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <16 x i32> %a, @vpandnd(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpandnd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpandnd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpandnd: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <16 x i32> %a, @vpord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpord: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpord: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpord: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <16 x i32> %a, @vpxord(<16 x i32> %a, <16 x i32> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpxord: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpxord: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpxord: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpaddd {{.*}}(%rip){1to16}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <16 x i32> %a, @vpandq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpandq: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpandq: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpandq: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <8 x i64> %a, @@ -4111,11 +5179,17 @@ entry: } define <8 x i64> @vpandnq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpandnq: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpandnq: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpandnq: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <8 x i64> %a, @@ -4125,11 +5199,17 @@ entry: } define <8 x i64> @vporq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vporq: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vporq: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: vporq %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vporq: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <8 x i64> %a, @@ -4138,11 +5218,17 @@ entry: } define <8 x i64> @vpxorq(<8 x i64> %a, <8 x i64> %b) nounwind uwtable readnone ssp { -; CHECK-LABEL: vpxorq: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpxorq: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; GENERIC-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vpxorq: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0 # sched: [8:0.50] +; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: ; Force the execution domain with an add. %a2 = add <8 x i64> %a, @@ -4151,27 +5237,29 @@ entry: } define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) { -; CHECK-LABEL: and_v64i8: -; CHECK: # BB#0: -; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: and_v64i8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: and_v64i8: -; SKX: ## BB#0: -; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %res = and <64 x i8> %a, %b ret <64 x i8> %res } define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) { -; CHECK-LABEL: andn_v64i8: -; CHECK: # BB#0: -; CHECK-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: andn_v64i8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vandnps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: andn_v64i8: -; SKX: ## BB#0: -; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b2 = xor <64 x i8> %b, @andn_v64i8(<64 x i8> %a, <64 x i8> %b) { } define <64 x i8> @or_v64i8(<64 x i8> %a, <64 x i8> %b) { -; CHECK-LABEL: or_v64i8: -; CHECK: # BB#0: -; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: or_v64i8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vorps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: or_v64i8: -; SKX: ## BB#0: -; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %res = or <64 x i8> %a, %b ret <64 x i8> %res } define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) { -; CHECK-LABEL: xor_v64i8: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: xor_v64i8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: xor_v64i8: -; SKX: ## BB#0: -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %res = xor <64 x i8> %a, %b ret <64 x i8> %res } define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) { -; CHECK-LABEL: and_v32i16: -; CHECK: # BB#0: -; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: and_v32i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: and_v32i16: -; SKX: ## BB#0: -; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %res = and <32 x i16> %a, %b ret <32 x i16> %res } define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) { -; CHECK-LABEL: andn_v32i16: -; CHECK: # BB#0: -; CHECK-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: andn_v32i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vandnps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: andn_v32i16: -; SKX: ## BB#0: -; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %b2 = xor <32 x i16> %b, %res = and <32 x i16> %a, %b2 @@ -4235,44 +5327,47 @@ define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) { } define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) { -; CHECK-LABEL: or_v32i16: -; CHECK: # BB#0: -; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: or_v32i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vorps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: or_v32i16: -; SKX: ## BB#0: -; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %res = or <32 x i16> %a, %b ret <32 x i16> %res } define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) { -; CHECK-LABEL: xor_v32i16: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: xor_v32i16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: xor_v32i16: -; SKX: ## BB#0: -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %res = xor <32 x i16> %a, %b ret <32 x i16> %res } define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { -; CHECK-LABEL: masked_and_v16f32: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] -; CHECK-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: masked_and_v16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} +; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: masked_and_v16f32: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} -; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] +; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <16 x float> %a to <16 x i32> %b1 = bitcast <16 x float> %b to <16 x i32> %passThru1 = bitcast <16 x float> %passThru to <16 x i32> @@ -4285,18 +5380,19 @@ define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x f } define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { -; CHECK-LABEL: masked_or_v16f32: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] -; CHECK-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: masked_or_v16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} +; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: masked_or_v16f32: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} -; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] +; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <16 x float> %a to <16 x i32> %b1 = bitcast <16 x float> %b to <16 x i32> %passThru1 = bitcast <16 x float> %passThru to <16 x i32> @@ -4309,18 +5405,19 @@ define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x fl } define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %passThru, i16 %mask, <16 x float> %c) { -; CHECK-LABEL: masked_xor_v16f32: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] -; CHECK-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: masked_xor_v16f32: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} +; GENERIC-NEXT: vaddps %zmm2, %zmm3, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: masked_xor_v16f32: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} -; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandps %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] +; SKX-NEXT: vaddps %zmm2, %zmm3, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <16 x float> %a to <16 x i32> %b1 = bitcast <16 x float> %b to <16 x i32> %passThru1 = bitcast <16 x float> %passThru to <16 x i32> @@ -4333,18 +5430,19 @@ define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x f } define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { -; CHECK-LABEL: masked_and_v8f64: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] -; CHECK-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: masked_and_v8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} +; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: masked_and_v8f64: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} -; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] +; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <8 x double> %a to <8 x i64> %b1 = bitcast <8 x double> %b to <8 x i64> %passThru1 = bitcast <8 x double> %passThru to <8 x i64> @@ -4357,18 +5455,19 @@ define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { -; CHECK-LABEL: masked_or_v8f64: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] -; CHECK-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: masked_or_v8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} +; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: masked_or_v8f64: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} -; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] +; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <8 x double> %a to <8 x i64> %b1 = bitcast <8 x double> %b to <8 x i64> %passThru1 = bitcast <8 x double> %passThru to <8 x i64> @@ -4381,18 +5480,19 @@ define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x doub } define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %passThru, i8 %mask, <8 x double> %c) { -; CHECK-LABEL: masked_xor_v8f64: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] -; CHECK-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: masked_xor_v8f64: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} +; GENERIC-NEXT: vaddpd %zmm2, %zmm3, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: masked_xor_v8f64: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} -; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandpd %zmm1, %zmm0, %zmm2 {%k1} # sched: [1:0.33] +; SKX-NEXT: vaddpd %zmm2, %zmm3, %zmm0 # sched: [4:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %a1 = bitcast <8 x double> %a to <8 x i64> %b1 = bitcast <8 x double> %b to <8 x i64> %passThru1 = bitcast <8 x double> %passThru to <8 x i64> @@ -4405,16 +5505,17 @@ define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou } define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { -; CHECK-LABEL: test_mm512_mask_and_epi32: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_and_epi32: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_and_epi32: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %and1.i.i = and <8 x i64> %__a, %__b %0 = bitcast <8 x i64> %and1.i.i to <16 x i32> @@ -4426,16 +5527,17 @@ entry: } define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { -; CHECK-LABEL: test_mm512_mask_or_epi32: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_or_epi32: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_or_epi32: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %or1.i.i = or <8 x i64> %__a, %__b %0 = bitcast <8 x i64> %or1.i.i to <16 x i32> @@ -4447,16 +5549,17 @@ entry: } define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k, <8 x i64> %__a, <8 x i64> %__b) { -; CHECK-LABEL: test_mm512_mask_xor_epi32: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_xor_epi32: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_xor_epi32: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %xor1.i.i = xor <8 x i64> %__a, %__b %0 = bitcast <8 x i64> %xor1.i.i to <16 x i32> @@ -4468,16 +5571,17 @@ entry: } define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { -; CHECK-LABEL: test_mm512_mask_xor_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_xor_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_xor_pd: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vxorpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> @@ -4489,16 +5593,17 @@ entry: } define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { -; CHECK-LABEL: test_mm512_maskz_xor_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_maskz_xor_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_maskz_xor_pd: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> @@ -4510,16 +5615,17 @@ entry: } define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { -; CHECK-LABEL: test_mm512_mask_xor_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_xor_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_xor_ps: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vxorps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> @@ -4531,16 +5637,17 @@ entry: } define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { -; CHECK-LABEL: test_mm512_maskz_xor_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_maskz_xor_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_maskz_xor_ps: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> @@ -4552,16 +5659,17 @@ entry: } define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { -; CHECK-LABEL: test_mm512_mask_or_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_or_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_or_pd: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vorpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> @@ -4573,16 +5681,17 @@ entry: } define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { -; CHECK-LABEL: test_mm512_maskz_or_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_maskz_or_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_maskz_or_pd: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vorpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> @@ -4594,16 +5703,17 @@ entry: } define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { -; CHECK-LABEL: test_mm512_mask_or_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_or_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_or_ps: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vorps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> @@ -4615,16 +5725,17 @@ entry: } define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { -; CHECK-LABEL: test_mm512_maskz_or_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_maskz_or_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_maskz_or_ps: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vorps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> @@ -4636,16 +5747,17 @@ entry: } define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { -; CHECK-LABEL: test_mm512_mask_and_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_and_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_and_pd: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandpd %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> @@ -4657,16 +5769,17 @@ entry: } define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { -; CHECK-LABEL: test_mm512_maskz_and_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_maskz_and_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_maskz_and_pd: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandpd %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %1 = bitcast <8 x double> %__B to <8 x i64> @@ -4678,16 +5791,17 @@ entry: } define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { -; CHECK-LABEL: test_mm512_mask_and_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_and_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_and_ps: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandps %zmm1, %zmm2, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> @@ -4699,16 +5813,17 @@ entry: } define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { -; CHECK-LABEL: test_mm512_maskz_and_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_maskz_and_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_maskz_and_ps: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandps %zmm0, %zmm1, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %1 = bitcast <16 x float> %__B to <16 x i32> @@ -4720,16 +5835,17 @@ entry: } define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { -; CHECK-LABEL: test_mm512_mask_andnot_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_andnot_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_andnot_pd: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandnpd %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %neg.i.i = xor <8 x i64> %0, @@ -4742,16 +5858,17 @@ entry: } define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %__A, <8 x double> %__B) { -; CHECK-LABEL: test_mm512_maskz_andnot_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_maskz_andnot_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_maskz_andnot_pd: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <8 x double> %__A to <8 x i64> %neg.i.i = xor <8 x i64> %0, @@ -4764,16 +5881,17 @@ entry: } define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { -; CHECK-LABEL: test_mm512_mask_andnot_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_mask_andnot_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_mask_andnot_ps: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandnps %zmm2, %zmm1, %zmm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %neg.i.i = xor <16 x i32> %0, @@ -4786,16 +5904,17 @@ entry: } define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %__A, <16 x float> %__B) { -; CHECK-LABEL: test_mm512_maskz_andnot_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_maskz_andnot_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_mm512_maskz_andnot_ps: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kmovd %edi, %k1 -; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: vandnps %zmm1, %zmm0, %zmm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = bitcast <16 x float> %__A to <16 x i32> %neg.i.i = xor <16 x i32> %0, @@ -4808,321 +5927,491 @@ entry: } define i32 @mov_test1(float %x) { -; CHECK-LABEL: mov_test1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test1: +; SKX: # BB#0: +; SKX-NEXT: vmovd %xmm0, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %res = bitcast float %x to i32 ret i32 %res } define <4 x i32> @mov_test2(i32 %x) { -; CHECK-LABEL: mov_test2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test2: +; SKX: # BB#0: +; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = insertelement <4 x i32>undef, i32 %x, i32 0 ret <4 x i32>%res } define <2 x i64> @mov_test3(i64 %x) { -; CHECK-LABEL: mov_test3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test3: +; SKX: # BB#0: +; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = insertelement <2 x i64>undef, i64 %x, i32 0 ret <2 x i64>%res } define <4 x i32> @mov_test4(i32* %x) { -; CHECK-LABEL: mov_test4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test4: +; SKX: # BB#0: +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %y = load i32, i32* %x %res = insertelement <4 x i32>undef, i32 %y, i32 0 ret <4 x i32>%res } define void @mov_test5(float %x, float* %y) { -; CHECK-LABEL: mov_test5: -; CHECK: # BB#0: -; CHECK-NEXT: vmovss %xmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovss %xmm0, (%rdi) # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test5: +; SKX: # BB#0: +; SKX-NEXT: vmovss %xmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] store float %x, float* %y, align 4 ret void } define void @mov_test6(double %x, double* %y) { -; CHECK-LABEL: mov_test6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovsd %xmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovsd %xmm0, (%rdi) # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test6: +; SKX: # BB#0: +; SKX-NEXT: vmovsd %xmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] store double %x, double* %y, align 8 ret void } define float @mov_test7(i32* %x) { -; CHECK-LABEL: mov_test7: -; CHECK: # BB#0: -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test7: +; SKX: # BB#0: +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %y = load i32, i32* %x %res = bitcast i32 %y to float ret float %res } define i32 @mov_test8(<4 x i32> %x) { -; CHECK-LABEL: mov_test8: -; CHECK: # BB#0: -; CHECK-NEXT: vmovd %xmm0, %eax # sched: [2:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovd %xmm0, %eax # sched: [2:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test8: +; SKX: # BB#0: +; SKX-NEXT: vmovd %xmm0, %eax # sched: [2:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = extractelement <4 x i32> %x, i32 0 ret i32 %res } define i64 @mov_test9(<2 x i64> %x) { -; CHECK-LABEL: mov_test9: -; CHECK: # BB#0: -; CHECK-NEXT: vmovq %xmm0, %rax # sched: [2:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test9: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovq %xmm0, %rax # sched: [2:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test9: +; SKX: # BB#0: +; SKX-NEXT: vmovq %xmm0, %rax # sched: [2:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = extractelement <2 x i64> %x, i32 0 ret i64 %res } define <4 x i32> @mov_test10(i32* %x) { -; CHECK-LABEL: mov_test10: -; CHECK: # BB#0: -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test10: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test10: +; SKX: # BB#0: +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %y = load i32, i32* %x, align 4 %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0 ret <4 x i32>%res } define <4 x float> @mov_test11(float* %x) { -; CHECK-LABEL: mov_test11: -; CHECK: # BB#0: -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test11: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test11: +; SKX: # BB#0: +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %y = load float, float* %x, align 4 %res = insertelement <4 x float>zeroinitializer, float %y, i32 0 ret <4 x float>%res } define <2 x double> @mov_test12(double* %x) { -; CHECK-LABEL: mov_test12: -; CHECK: # BB#0: -; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test12: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [6:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test12: +; SKX: # BB#0: +; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero sched: [5:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %y = load double, double* %x, align 8 %res = insertelement <2 x double>zeroinitializer, double %y, i32 0 ret <2 x double>%res } define <2 x i64> @mov_test13(i64 %x) { -; CHECK-LABEL: mov_test13: -; CHECK: # BB#0: -; CHECK-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test13: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test13: +; SKX: # BB#0: +; SKX-NEXT: vmovq %rdi, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0 ret <2 x i64>%res } define <4 x i32> @mov_test14(i32 %x) { -; CHECK-LABEL: mov_test14: -; CHECK: # BB#0: -; CHECK-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test14: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test14: +; SKX: # BB#0: +; SKX-NEXT: vmovd %edi, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0 ret <4 x i32>%res } define <4 x i32> @mov_test15(i32* %x) { -; CHECK-LABEL: mov_test15: -; CHECK: # BB#0: -; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test15: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [6:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test15: +; SKX: # BB#0: +; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [5:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %y = load i32, i32* %x, align 4 %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0 ret <4 x i32>%res } define <16 x i32> @mov_test16(i8 * %addr) { -; CHECK-LABEL: mov_test16: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test16: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 1 ret <16 x i32>%res } define <16 x i32> @mov_test17(i8 * %addr) { -; CHECK-LABEL: mov_test17: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test17: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test17: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* %res = load <16 x i32>, <16 x i32>* %vaddr, align 64 ret <16 x i32>%res } define void @mov_test18(i8 * %addr, <8 x i64> %data) { -; CHECK-LABEL: mov_test18: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test18: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test18: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* store <8 x i64>%data, <8 x i64>* %vaddr, align 64 ret void } define void @mov_test19(i8 * %addr, <16 x i32> %data) { -; CHECK-LABEL: mov_test19: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test19: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test19: +; SKX: # BB#0: +; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 1 ret void } define void @mov_test20(i8 * %addr, <16 x i32> %data) { -; CHECK-LABEL: mov_test20: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test20: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test20: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x i32>* store <16 x i32>%data, <16 x i32>* %vaddr, align 64 ret void } define <8 x i64> @mov_test21(i8 * %addr) { -; CHECK-LABEL: mov_test21: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test21: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test21: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 64 ret <8 x i64>%res } define void @mov_test22(i8 * %addr, <8 x i64> %data) { -; CHECK-LABEL: mov_test22: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test22: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test22: +; SKX: # BB#0: +; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* store <8 x i64>%data, <8 x i64>* %vaddr, align 1 ret void } define <8 x i64> @mov_test23(i8 * %addr) { -; CHECK-LABEL: mov_test23: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test23: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test23: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x i64>* %res = load <8 x i64>, <8 x i64>* %vaddr, align 1 ret <8 x i64>%res } define void @mov_test24(i8 * %addr, <8 x double> %data) { -; CHECK-LABEL: mov_test24: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test24: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test24: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 64 ret void } define <8 x double> @mov_test25(i8 * %addr) { -; CHECK-LABEL: mov_test25: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test25: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test25: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 64 ret <8 x double>%res } define void @mov_test26(i8 * %addr, <16 x float> %data) { -; CHECK-LABEL: mov_test26: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test26: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test26: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 64 ret void } define <16 x float> @mov_test27(i8 * %addr) { -; CHECK-LABEL: mov_test27: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test27: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test27: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x float>* %res = load <16 x float>, <16 x float>* %vaddr, align 64 ret <16 x float>%res } define void @mov_test28(i8 * %addr, <8 x double> %data) { -; CHECK-LABEL: mov_test28: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test28: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test28: +; SKX: # BB#0: +; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 1 ret void } define <8 x double> @mov_test29(i8 * %addr) { -; CHECK-LABEL: mov_test29: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test29: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test29: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 1 ret <8 x double>%res } define void @mov_test30(i8 * %addr, <16 x float> %data) { -; CHECK-LABEL: mov_test30: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test30: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test30: +; SKX: # BB#0: +; SKX-NEXT: vmovups %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x float>* store <16 x float>%data, <16 x float>* %vaddr, align 1 ret void } define <16 x float> @mov_test31(i8 * %addr) { -; CHECK-LABEL: mov_test31: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test31: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test31: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %vaddr = bitcast i8* %addr to <16 x float>* %res = load <16 x float>, <16 x float>* %vaddr, align 1 ret <16 x float>%res } define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { -; CHECK-LABEL: mov_test32: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test32: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 @@ -5131,12 +6420,19 @@ define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { } define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { -; CHECK-LABEL: mov_test33: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test33: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test33: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 @@ -5145,12 +6441,19 @@ define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { } define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) { -; CHECK-LABEL: mov_test34: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test34: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test34: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 64 @@ -5159,12 +6462,19 @@ define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) { } define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) { -; CHECK-LABEL: mov_test35: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test35: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test35: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x i32>* %r = load <16 x i32>, <16 x i32>* %vaddr, align 1 @@ -5173,12 +6483,19 @@ define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) { } define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { -; CHECK-LABEL: mov_test36: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test36: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test36: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 @@ -5187,12 +6504,19 @@ define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { } define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { -; CHECK-LABEL: mov_test37: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test37: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test37: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 @@ -5201,12 +6525,19 @@ define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { } define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) { -; CHECK-LABEL: mov_test38: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test38: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test38: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 64 @@ -5215,12 +6546,19 @@ define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) { } define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) { -; CHECK-LABEL: mov_test39: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test39: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test39: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i64> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x i64>* %r = load <8 x i64>, <8 x i64>* %vaddr, align 1 @@ -5229,12 +6567,19 @@ define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) { } define <16 x float> @mov_test40(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { -; CHECK-LABEL: mov_test40: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test40: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test40: +; SKX: # BB#0: +; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 64 @@ -5243,12 +6588,19 @@ define <16 x float> @mov_test40(i8 * %addr, <16 x float> %old, <16 x float> %mas } define <16 x float> @mov_test41(i8 * %addr, <16 x float> %old, <16 x float> %mask1) { -; CHECK-LABEL: mov_test41: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test41: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test41: +; SKX: # BB#0: +; SKX-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 1 @@ -5257,12 +6609,19 @@ define <16 x float> @mov_test41(i8 * %addr, <16 x float> %old, <16 x float> %mas } define <16 x float> @mov_test42(i8 * %addr, <16 x float> %mask1) { -; CHECK-LABEL: mov_test42: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test42: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test42: +; SKX: # BB#0: +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 64 @@ -5271,12 +6630,19 @@ define <16 x float> @mov_test42(i8 * %addr, <16 x float> %mask1) { } define <16 x float> @mov_test43(i8 * %addr, <16 x float> %mask1) { -; CHECK-LABEL: mov_test43: -; CHECK: # BB#0: -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test43: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test43: +; SKX: # BB#0: +; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <16 x float> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <16 x float>* %r = load <16 x float>, <16 x float>* %vaddr, align 1 @@ -5285,12 +6651,19 @@ define <16 x float> @mov_test43(i8 * %addr, <16 x float> %mask1) { } define <8 x double> @mov_test44(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { -; CHECK-LABEL: mov_test44: -; CHECK: # BB#0: -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test44: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test44: +; SKX: # BB#0: +; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 64 @@ -5299,12 +6672,19 @@ define <8 x double> @mov_test44(i8 * %addr, <8 x double> %old, <8 x double> %mas } define <8 x double> @mov_test45(i8 * %addr, <8 x double> %old, <8 x double> %mask1) { -; CHECK-LABEL: mov_test45: -; CHECK: # BB#0: -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test45: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test45: +; SKX: # BB#0: +; SKX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 1 @@ -5313,12 +6693,19 @@ define <8 x double> @mov_test45(i8 * %addr, <8 x double> %old, <8 x double> %mas } define <8 x double> @mov_test46(i8 * %addr, <8 x double> %mask1) { -; CHECK-LABEL: mov_test46: -; CHECK: # BB#0: -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test46: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test46: +; SKX: # BB#0: +; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 64 @@ -5327,12 +6714,19 @@ define <8 x double> @mov_test46(i8 * %addr, <8 x double> %mask1) { } define <8 x double> @mov_test47(i8 * %addr, <8 x double> %mask1) { -; CHECK-LABEL: mov_test47: -; CHECK: # BB#0: -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mov_test47: +; GENERIC: # BB#0: +; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] +; GENERIC-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mov_test47: +; SKX: # BB#0: +; SKX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %mask = fcmp one <8 x double> %mask1, zeroinitializer %vaddr = bitcast i8* %addr to <8 x double>* %r = load <8 x double>, <8 x double>* %vaddr, align 1 @@ -5341,20 +6735,21 @@ define <8 x double> @mov_test47(i8 * %addr, <8 x double> %mask1) { } define i16 @mask16(i16 %x) { -; CHECK-LABEL: mask16: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mask16: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: mask16: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: knotw %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AX %AX %EAX +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, %ret = bitcast <16 x i1> %m1 to i16 @@ -5362,18 +6757,19 @@ define i16 @mask16(i16 %x) { } define i32 @mask16_zext(i16 %x) { -; CHECK-LABEL: mask16_zext: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovw %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mask16_zext: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: kmovw %k0, %eax +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: mask16_zext: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: knotw %k0, %k0 -; SKX-NEXT: kmovw %k0, %eax -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, %m2 = bitcast <16 x i1> %m1 to i16 @@ -5382,20 +6778,21 @@ define i32 @mask16_zext(i16 %x) { } define i8 @mask8(i8 %x) { -; CHECK-LABEL: mask8: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: knotb %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AL %AL %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mask8: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: knotb %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AL %AL %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: mask8: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: knotb %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AL %AL %EAX +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, %ret = bitcast <8 x i1> %m1 to i8 @@ -5403,18 +6800,19 @@ define i8 @mask8(i8 %x) { } define i32 @mask8_zext(i8 %x) { -; CHECK-LABEL: mask8_zext: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: knotb %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mask8_zext: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: knotb %k0, %k0 +; GENERIC-NEXT: kmovb %k0, %eax +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: mask8_zext: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: knotb %k0, %k0 -; SKX-NEXT: kmovb %k0, %eax -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, %eax # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, %m2 = bitcast <8 x i1> %m1 to i8 @@ -5423,12 +6821,19 @@ define i32 @mask8_zext(i8 %x) { } define void @mask16_mem(i16* %ptr) { -; CHECK-LABEL: mask16_mem: -; CHECK: # BB#0: -; CHECK-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mask16_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovw (%rdi), %k0 +; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: kmovw %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mask16_mem: +; SKX: # BB#0: +; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] +; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = load i16, i16* %ptr, align 4 %m0 = bitcast i16 %x to <16 x i1> %m1 = xor <16 x i1> %m0, @@ -5438,18 +6843,19 @@ define void @mask16_mem(i16* %ptr) { } define void @mask8_mem(i8* %ptr) { -; CHECK-LABEL: mask8_mem: -; CHECK: # BB#0: -; CHECK-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: knotb %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mask8_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovb (%rdi), %k0 +; GENERIC-NEXT: knotb %k0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: mask8_mem: -; SKX: ## BB#0: -; SKX-NEXT: kmovb (%rdi), %k0 -; SKX-NEXT: knotb %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] +; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = load i8, i8* %ptr, align 4 %m0 = bitcast i8 %x to <8 x i1> %m1 = xor <8 x i1> %m0, @@ -5459,14 +6865,23 @@ define void @mask8_mem(i8* %ptr) { } define i16 @mand16(i16 %x, i16 %y) { -; CHECK-LABEL: mand16: -; CHECK: # BB#0: -; CHECK-NEXT: movl %edi, %eax # sched: [1:0.25] -; CHECK-NEXT: xorl %esi, %eax # sched: [1:0.25] -; CHECK-NEXT: andl %esi, %edi # sched: [1:0.25] -; CHECK-NEXT: orl %eax, %edi # sched: [1:0.25] -; CHECK-NEXT: movl %edi, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mand16: +; GENERIC: # BB#0: +; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] +; GENERIC-NEXT: xorl %esi, %eax # sched: [1:0.33] +; GENERIC-NEXT: andl %esi, %edi # sched: [1:0.33] +; GENERIC-NEXT: orl %eax, %edi # sched: [1:0.33] +; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: mand16: +; SKX: # BB#0: +; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] +; SKX-NEXT: xorl %esi, %eax # sched: [1:0.25] +; SKX-NEXT: andl %esi, %edi # sched: [1:0.25] +; SKX-NEXT: orl %eax, %edi # sched: [1:0.25] +; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %ma = bitcast i16 %x to <16 x i1> %mb = bitcast i16 %y to <16 x i1> %mc = and <16 x i1> %ma, %mb @@ -5477,26 +6892,27 @@ define i16 @mand16(i16 %x, i16 %y) { } define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) { -; CHECK-LABEL: mand16_mem: -; CHECK: # BB#0: -; CHECK-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: kmovw (%rsi), %k1 # sched: [7:1.00] -; CHECK-NEXT: kandw %k1, %k0, %k2 # sched: [1:1.00] -; CHECK-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: korw %k0, %k2, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: mand16_mem: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovw (%rdi), %k0 +; GENERIC-NEXT: kmovw (%rsi), %k1 +; GENERIC-NEXT: kandw %k1, %k0, %k2 +; GENERIC-NEXT: kxorw %k1, %k0, %k0 +; GENERIC-NEXT: korw %k0, %k2, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: mand16_mem: -; SKX: ## BB#0: -; SKX-NEXT: kmovw (%rdi), %k0 -; SKX-NEXT: kmovw (%rsi), %k1 -; SKX-NEXT: kandw %k1, %k0, %k2 -; SKX-NEXT: kxorw %k1, %k0, %k0 -; SKX-NEXT: korw %k0, %k2, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] +; SKX-NEXT: kmovw (%rsi), %k1 # sched: [7:1.00] +; SKX-NEXT: kandw %k1, %k0, %k2 # sched: [1:1.00] +; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: korw %k0, %k2, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AX %AX %EAX +; SKX-NEXT: retq # sched: [7:1.00] %ma = load <16 x i1>, <16 x i1>* %x %mb = load <16 x i1>, <16 x i1>* %y %mc = and <16 x i1> %ma, %mb @@ -5507,20 +6923,21 @@ define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) { } define i8 @shuf_test1(i16 %v) nounwind { -; CHECK-LABEL: shuf_test1: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: kshiftrw $8, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AL %AL %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: shuf_test1: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: kshiftrw $8, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AL %AL %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: shuf_test1: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kshiftrw $8, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kshiftrw $8, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AL %AL %EAX +; SKX-NEXT: retq # sched: [7:1.00] %v1 = bitcast i16 %v to <16 x i1> %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> %mask1 = bitcast <8 x i1> %mask to i8 @@ -5528,24 +6945,25 @@ define i8 @shuf_test1(i16 %v) nounwind { } define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: zext_test1: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftlw $10, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftrw $15, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: andl $1, %eax # sched: [1:0.25] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_test1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; GENERIC-NEXT: kshiftlw $10, %k0, %k0 +; GENERIC-NEXT: kshiftrw $15, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33] +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_test1: -; SKX: ## BB#0: -; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; SKX-NEXT: kshiftlw $10, %k0, %k0 -; SKX-NEXT: kshiftrw $15, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftlw $10, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftrw $15, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: andl $1, %eax # sched: [1:0.25] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i32 @@ -5553,26 +6971,27 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) { } define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: zext_test2: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftlw $10, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftrw $15, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: andl $1, %eax # sched: [1:0.25] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_test2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; GENERIC-NEXT: kshiftlw $10, %k0, %k0 +; GENERIC-NEXT: kshiftrw $15, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: andl $1, %eax # sched: [1:0.33] +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_test2: -; SKX: ## BB#0: -; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; SKX-NEXT: kshiftlw $10, %k0, %k0 -; SKX-NEXT: kshiftrw $15, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andl $1, %eax -; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftlw $10, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftrw $15, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: andl $1, %eax # sched: [1:0.25] +; SKX-NEXT: # kill: %AX %AX %EAX +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i16 @@ -5580,26 +6999,27 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) { } define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { -; CHECK-LABEL: zext_test3: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftlw $10, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftrw $15, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: andb $1, %al # sched: [1:0.25] -; CHECK-NEXT: # kill: %AL %AL %EAX -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: zext_test3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 +; GENERIC-NEXT: kshiftlw $10, %k0, %k0 +; GENERIC-NEXT: kshiftrw $15, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: andb $1, %al # sched: [1:0.33] +; GENERIC-NEXT: # kill: %AL %AL %EAX +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: zext_test3: -; SKX: ## BB#0: -; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 -; SKX-NEXT: kshiftlw $10, %k0, %k0 -; SKX-NEXT: kshiftrw $15, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $1, %al -; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpcmpnleud %zmm1, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftlw $10, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftrw $15, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: andb $1, %al # sched: [1:0.25] +; SKX-NEXT: # kill: %AL %AL %EAX +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %cmp_res = icmp ugt <16 x i32> %a, %b %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5 %res = zext i1 %cmp_res.i1 to i8 @@ -5607,20 +7027,21 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) { } define i8 @conv1(<8 x i1>* %R) { -; CHECK-LABEL: conv1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: kxnorw %k0, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [1:1.00] -; CHECK-NEXT: movb $-2, %al # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: conv1: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: kxnorw %k0, %k0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [5:1.00] +; GENERIC-NEXT: movb $-2, %al # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: conv1: -; SKX: ## BB#0: ## %entry -; SKX-NEXT: kxnorw %k0, %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp) -; SKX-NEXT: movb $-2, %al -; SKX-NEXT: retq +; SKX: # BB#0: # %entry +; SKX-NEXT: kxnorw %k0, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: movb $-2, -{{[0-9]+}}(%rsp) # sched: [1:1.00] +; SKX-NEXT: movb $-2, %al # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] entry: store <8 x i1> , <8 x i1>* %R @@ -5632,22 +7053,23 @@ entry: } define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1) { -; CHECK-LABEL: test4: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: kandnw %k0, %k1, %k0 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2d %k0, %xmm0 -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 +; GENERIC-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: kandnw %k0, %k1, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test4: -; SKX: ## BB#0: -; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 -; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 -; SKX-NEXT: kandnw %k0, %k1, %k0 +; SKX: # BB#0: +; SKX-NEXT: vpcmpgtq %ymm1, %ymm0, %k0 # sched: [3:1.00] +; SKX-NEXT: vpcmpgtq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: kandnw %k0, %k1, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x_gt_y = icmp sgt <4 x i64> %x, %y %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1 %res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1 @@ -5656,20 +7078,21 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1 } define <2 x i64> @vcmp_test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1) { -; CHECK-LABEL: vcmp_test5: -; CHECK: # BB#0: -; CHECK-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: kandnw %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2q %k0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vcmp_test5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 +; GENERIC-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: kandnw %k1, %k0, %k0 +; GENERIC-NEXT: vpmovm2q %k0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vcmp_test5: -; SKX: ## BB#0: -; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 -; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 -; SKX-NEXT: kandnw %k1, %k0, %k0 +; SKX: # BB#0: +; SKX-NEXT: vpcmpgtq %xmm0, %xmm1, %k0 # sched: [3:1.00] +; SKX-NEXT: vpcmpgtq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: kandnw %k1, %k0, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2q %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %x_gt_y = icmp slt <2 x i64> %x, %y %x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1 %res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1 @@ -5689,24 +7112,25 @@ false: ret void } define void @vcmp_test7(<8 x i1> %mask) { -; CHECK-LABEL: vcmp_test7: -; CHECK: # BB#0: # %allocas -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: movb $85, %al # sched: [1:0.25] -; CHECK-NEXT: kmovd %eax, %k1 # sched: [1:1.00] -; CHECK-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: ktestb %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vcmp_test7: +; GENERIC: # BB#0: # %allocas +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k0 +; GENERIC-NEXT: movb $85, %al # sched: [1:0.33] +; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: korb %k1, %k0, %k0 +; GENERIC-NEXT: ktestb %k0, %k0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vcmp_test7: -; SKX: ## BB#0: ## %allocas -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k0 -; SKX-NEXT: movb $85, %al -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: korb %k1, %k0, %k0 -; SKX-NEXT: ktestb %k0, %k0 -; SKX-NEXT: retq +; SKX: # BB#0: # %allocas +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: movb $85, %al # sched: [1:0.25] +; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] +; SKX-NEXT: korb %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: ktestb %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] allocas: %a= or <8 x i1> %mask, %b = bitcast <8 x i1> %a to i8 @@ -5720,36 +7144,37 @@ false: ret void } define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { -; CHECK-LABEL: vcmp_test8: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: cmpl %esi, %edi # sched: [1:0.25] -; CHECK-NEXT: jg .LBB386_1 # sched: [1:0.50] -; CHECK-NEXT: # BB#2: -; CHECK-NEXT: vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2b %k0, %xmm0 -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; CHECK-NEXT: .LBB386_1: -; CHECK-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2b %k0, %xmm0 -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vcmp_test8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] +; GENERIC-NEXT: jg .LBB386_1 # sched: [1:1.00] +; GENERIC-NEXT: # BB#2: +; GENERIC-NEXT: vpcmpltud %zmm2, %zmm1, %k0 +; GENERIC-NEXT: vpmovm2b %k0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; GENERIC-NEXT: .LBB386_1: +; GENERIC-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 +; GENERIC-NEXT: vpmovm2b %k0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vcmp_test8: -; SKX: ## BB#0: -; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; SKX-NEXT: jg LBB17_1 -; SKX-NEXT: ## BB#2: -; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] +; SKX-NEXT: jg .LBB386_1 # sched: [1:0.50] +; SKX-NEXT: # BB#2: +; SKX-NEXT: vpcmpltud %zmm2, %zmm1, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; SKX-NEXT: LBB17_1: -; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] +; SKX-NEXT: .LBB386_1: +; SKX-NEXT: vpcmpgtd %zmm2, %zmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %cond = icmp sgt i32 %a1, %b1 %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer %cmp2 = icmp ult <16 x i32> %b, zeroinitializer @@ -5758,32 +7183,33 @@ define <16 x i8> @vcmp_test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) { ret <16 x i8> %res } define <16 x i1> @vpmov_test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { -; CHECK-LABEL: vpmov_test9: -; CHECK: # BB#0: -; CHECK-NEXT: cmpl %esi, %edi # sched: [1:0.25] -; CHECK-NEXT: jg .LBB387_1 # sched: [1:0.50] -; CHECK-NEXT: # BB#2: -; CHECK-NEXT: vpsllw $7, %xmm1, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: jmp .LBB387_3 # sched: [1:0.50] -; CHECK-NEXT: .LBB387_1: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: .LBB387_3: -; CHECK-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2b %k0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vpmov_test9: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] +; GENERIC-NEXT: jg .LBB387_1 # sched: [1:1.00] +; GENERIC-NEXT: # BB#2: +; GENERIC-NEXT: vpsllw $7, %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: jmp .LBB387_3 # sched: [1:1.00] +; GENERIC-NEXT: .LBB387_1: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: .LBB387_3: +; GENERIC-NEXT: vpmovb2m %xmm0, %k0 +; GENERIC-NEXT: vpmovm2b %k0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vpmov_test9: -; SKX: ## BB#0: -; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: jg LBB18_1 -; SKX-NEXT: ## BB#2: -; SKX-NEXT: vpsllw $7, %xmm1, %xmm0 -; SKX-NEXT: jmp LBB18_3 -; SKX-NEXT: LBB18_1: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: LBB18_3: -; SKX-NEXT: vpmovb2m %xmm0, %k0 +; SKX: # BB#0: +; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] +; SKX-NEXT: jg .LBB387_1 # sched: [1:0.50] +; SKX-NEXT: # BB#2: +; SKX-NEXT: vpsllw $7, %xmm1, %xmm0 # sched: [1:0.50] +; SKX-NEXT: jmp .LBB387_3 # sched: [1:0.50] +; SKX-NEXT: .LBB387_1: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: .LBB387_3: +; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <16 x i1>%a, <16 x i1>%b ret <16 x i1>%c @@ -5794,42 +7220,48 @@ define <16 x i1> @vpmov_test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) { } define <4 x i1> @vmov_test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) { -; CHECK-LABEL: vmov_test11: -; CHECK: # BB#0: -; CHECK-NEXT: cmpl %esi, %edi # sched: [1:0.25] -; CHECK-NEXT: jg .LBB389_1 # sched: [1:0.50] -; CHECK-NEXT: # BB#2: -; CHECK-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: jmp .LBB389_3 # sched: [1:0.50] -; CHECK-NEXT: .LBB389_1: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: .LBB389_3: -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test11: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] +; GENERIC-NEXT: jg .LBB389_1 # sched: [1:1.00] +; GENERIC-NEXT: # BB#2: +; GENERIC-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: jmp .LBB389_3 # sched: [1:1.00] +; GENERIC-NEXT: .LBB389_1: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: .LBB389_3: +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vmov_test11: -; SKX: ## BB#0: -; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: jg LBB20_1 -; SKX-NEXT: ## BB#2: -; SKX-NEXT: vpslld $31, %xmm1, %xmm0 -; SKX-NEXT: jmp LBB20_3 -; SKX-NEXT: LBB20_1: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: LBB20_3: -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 +; SKX: # BB#0: +; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] +; SKX-NEXT: jg .LBB389_1 # sched: [1:0.50] +; SKX-NEXT: # BB#2: +; SKX-NEXT: vpslld $31, %xmm1, %xmm0 # sched: [1:0.50] +; SKX-NEXT: jmp .LBB389_3 # sched: [1:0.50] +; SKX-NEXT: .LBB389_1: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: .LBB389_3: +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp sgt i32 %a1, %b1 %c = select i1 %mask, <4 x i1>%a, <4 x i1>%b ret <4 x i1>%c } define i32 @vmov_test12(i32 %x, i32 %y) { -; CHECK-LABEL: vmov_test12: -; CHECK: # BB#0: -; CHECK-NEXT: movl %edi, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test12: +; GENERIC: # BB#0: +; GENERIC-NEXT: movl %edi, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vmov_test12: +; SKX: # BB#0: +; SKX-NEXT: movl %edi, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 0 %c = select i1 %b, i32 %x, i32 %y @@ -5837,10 +7269,15 @@ define i32 @vmov_test12(i32 %x, i32 %y) { } define i32 @vmov_test13(i32 %x, i32 %y) { -; CHECK-LABEL: vmov_test13: -; CHECK: # BB#0: -; CHECK-NEXT: movl %esi, %eax # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test13: +; GENERIC: # BB#0: +; GENERIC-NEXT: movl %esi, %eax # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: vmov_test13: +; SKX: # BB#0: +; SKX-NEXT: movl %esi, %eax # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i16 21845 to <16 x i1> %b = extractelement <16 x i1> %a, i32 3 %c = select i1 %b, i32 %x, i32 %y @@ -5853,25 +7290,27 @@ define i32 @vmov_test13(i32 %x, i32 %y) { } define <16 x i1> @vmov_test15(i32 %x, i32 %y) { -; CHECK-LABEL: vmov_test15: -; CHECK: # BB#0: -; CHECK-NEXT: cmpl %esi, %edi # sched: [1:0.25] -; CHECK-NEXT: movw $21845, %ax # imm = 0x5555 -; CHECK-NEXT: # sched: [1:0.25] -; CHECK-NEXT: movw $1, %cx # sched: [1:0.25] -; CHECK-NEXT: cmovgw %ax, %cx # sched: [1:0.50] -; CHECK-NEXT: kmovd %ecx, %k0 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2b %k0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test15: +; GENERIC: # BB#0: +; GENERIC-NEXT: cmpl %esi, %edi # sched: [1:0.33] +; GENERIC-NEXT: movw $21845, %ax # imm = 0x5555 +; GENERIC-NEXT: # sched: [1:0.33] +; GENERIC-NEXT: movw $1, %cx # sched: [1:0.33] +; GENERIC-NEXT: cmovgw %ax, %cx # sched: [2:0.67] +; GENERIC-NEXT: kmovd %ecx, %k0 +; GENERIC-NEXT: vpmovm2b %k0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vmov_test15: -; SKX: ## BB#0: -; SKX-NEXT: cmpl %esi, %edi -; SKX-NEXT: movw $21845, %ax ## imm = 0x5555 -; SKX-NEXT: movw $1, %cx -; SKX-NEXT: cmovgw %ax, %cx -; SKX-NEXT: kmovd %ecx, %k0 +; SKX: # BB#0: +; SKX-NEXT: cmpl %esi, %edi # sched: [1:0.25] +; SKX-NEXT: movw $21845, %ax # imm = 0x5555 +; SKX-NEXT: # sched: [1:0.25] +; SKX-NEXT: movw $1, %cx # sched: [1:0.25] +; SKX-NEXT: cmovgw %ax, %cx # sched: [1:0.50] +; SKX-NEXT: kmovd %ecx, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i16 21845 to <16 x i1> %b = bitcast i16 1 to <16 x i1> %mask = icmp sgt i32 %x, %y @@ -5881,36 +7320,37 @@ define <16 x i1> @vmov_test15(i32 %x, i32 %y) { define <64 x i8> @vmov_test16(i64 %x) { ; -; CHECK-LABEL: vmov_test16: -; CHECK: # BB#0: -; CHECK-NEXT: kmovq %rdi, %k0 # sched: [1:1.00] -; CHECK-NEXT: movb $1, %al # sched: [1:0.25] -; CHECK-NEXT: kmovd %eax, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2b %k1, %zmm0 -; CHECK-NEXT: vpsllq $40, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovm2b %k0, %zmm1 -; CHECK-NEXT: movl $32, %eax # sched: [1:0.25] -; CHECK-NEXT: kmovd %eax, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [3:1.00] -; CHECK-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2b %k0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test16: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovq %rdi, %k0 +; GENERIC-NEXT: movb $1, %al # sched: [1:0.33] +; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: vpmovm2b %k1, %zmm0 +; GENERIC-NEXT: vpsllq $40, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovm2b %k0, %zmm1 +; GENERIC-NEXT: movl $32, %eax # sched: [1:0.33] +; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; GENERIC-NEXT: vpmovb2m %zmm0, %k0 +; GENERIC-NEXT: vpmovm2b %k0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vmov_test16: -; SKX: ## BB#0: -; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: movb $1, %al -; SKX-NEXT: kmovd %eax, %k1 +; SKX: # BB#0: +; SKX-NEXT: kmovq %rdi, %k0 # sched: [1:1.00] +; SKX-NEXT: movb $1, %al # sched: [1:0.25] +; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k1, %zmm0 -; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 +; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovm2b %k0, %zmm1 -; SKX-NEXT: movl $32, %eax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} -; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; SKX-NEXT: vpmovb2m %zmm0, %k0 +; SKX-NEXT: movl $32, %eax # sched: [1:0.25] +; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] +; SKX-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i64 %x to <64 x i1> %b = insertelement <64 x i1>%a, i1 true, i32 5 %c = sext <64 x i1>%b to <64 x i8> @@ -5919,38 +7359,39 @@ define <64 x i8> @vmov_test16(i64 %x) { define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) { ; -; CHECK-LABEL: vmov_test17: -; CHECK: # BB#0: -; CHECK-NEXT: kmovq %rdi, %k0 # sched: [1:1.00] -; CHECK-NEXT: cmpl %edx, %esi # sched: [1:0.25] -; CHECK-NEXT: setg %al # sched: [1:0.50] -; CHECK-NEXT: kmovd %eax, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2b %k1, %zmm0 -; CHECK-NEXT: vpsllq $40, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovm2b %k0, %zmm1 -; CHECK-NEXT: movl $32, %eax # sched: [1:0.25] -; CHECK-NEXT: kmovd %eax, %k1 # sched: [1:1.00] -; CHECK-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [3:1.00] -; CHECK-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2b %k0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test17: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovq %rdi, %k0 +; GENERIC-NEXT: cmpl %edx, %esi # sched: [1:0.33] +; GENERIC-NEXT: setg %al # sched: [1:0.50] +; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: vpmovm2b %k1, %zmm0 +; GENERIC-NEXT: vpsllq $40, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovm2b %k0, %zmm1 +; GENERIC-NEXT: movl $32, %eax # sched: [1:0.33] +; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] +; GENERIC-NEXT: vpmovb2m %zmm0, %k0 +; GENERIC-NEXT: vpmovm2b %k0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vmov_test17: -; SKX: ## BB#0: -; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: cmpl %edx, %esi -; SKX-NEXT: setg %al -; SKX-NEXT: kmovd %eax, %k1 +; SKX: # BB#0: +; SKX-NEXT: kmovq %rdi, %k0 # sched: [1:1.00] +; SKX-NEXT: cmpl %edx, %esi # sched: [1:0.25] +; SKX-NEXT: setg %al # sched: [1:0.50] +; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k1, %zmm0 -; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 +; SKX-NEXT: vpsllq $40, %xmm0, %xmm0 # sched: [1:0.50] ; SKX-NEXT: vpmovm2b %k0, %zmm1 -; SKX-NEXT: movl $32, %eax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} -; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] -; SKX-NEXT: vpmovb2m %zmm0, %k0 +; SKX-NEXT: movl $32, %eax # sched: [1:0.25] +; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] +; SKX-NEXT: vpblendmb %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2b %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %a = bitcast i64 %x to <64 x i1> %b = icmp sgt i32 %y, %z %c = insertelement <64 x i1>%a, i1 %b, i32 5 @@ -5959,50 +7400,47 @@ define <64 x i8> @vmov_test17(i64 %x, i32 %y, i32 %z) { } define <8 x i1> @vmov_test18(i8 %a, i16 %y) { -; CHECK-LABEL: vmov_test18: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k1 # sched: [1:1.00] -; CHECK-NEXT: kmovd %esi, %k2 # sched: [1:1.00] -; CHECK-NEXT: kshiftlw $7, %k2, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftrw $15, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kshiftlw $6, %k2, %k2 # sched: [3:1.00] -; CHECK-NEXT: kshiftrw $15, %k2, %k2 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2q %k1, %zmm0 -; CHECK-NEXT: vpmovm2q %k2, %zmm1 -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] sched: [8:0.50] -; CHECK-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 # sched: [3:1.00] -; CHECK-NEXT: vpmovq2m %zmm2, %k1 # sched: [1:1.00] -; CHECK-NEXT: kshiftlb $1, %k1, %k1 # sched: [3:1.00] -; CHECK-NEXT: kshiftrb $1, %k1, %k1 # sched: [3:1.00] -; CHECK-NEXT: kshiftlb $7, %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: korb %k0, %k1, %k0 # sched: [1:1.00] -; CHECK-NEXT: vpmovm2w %k0, %xmm0 -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test18: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k1 +; GENERIC-NEXT: kmovd %esi, %k2 +; GENERIC-NEXT: kshiftlw $7, %k2, %k0 +; GENERIC-NEXT: kshiftrw $15, %k0, %k0 +; GENERIC-NEXT: kshiftlw $6, %k2, %k2 +; GENERIC-NEXT: kshiftrw $15, %k2, %k2 +; GENERIC-NEXT: vpmovm2q %k1, %zmm0 +; GENERIC-NEXT: vpmovm2q %k2, %zmm1 +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] sched: [4:0.50] +; GENERIC-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; GENERIC-NEXT: vpmovq2m %zmm2, %k1 +; GENERIC-NEXT: kshiftlb $1, %k1, %k1 +; GENERIC-NEXT: kshiftrb $1, %k1, %k1 +; GENERIC-NEXT: kshiftlb $7, %k0, %k0 +; GENERIC-NEXT: korb %k0, %k1, %k0 +; GENERIC-NEXT: vpmovm2w %k0, %xmm0 +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vmov_test18: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftlw $7, %k1, %k2 -; SKX-NEXT: kshiftrw $15, %k2, %k2 -; SKX-NEXT: kmovd %k2, %eax -; SKX-NEXT: kshiftlw $6, %k1, %k1 -; SKX-NEXT: kshiftrw $15, %k1, %k1 -; SKX-NEXT: kmovd %k1, %ecx -; SKX-NEXT: vpmovm2q %k0, %zmm0 -; SKX-NEXT: kmovd %ecx, %k0 -; SKX-NEXT: vpmovm2q %k0, %zmm1 -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] -; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; SKX-NEXT: vpmovq2m %zmm2, %k0 -; SKX-NEXT: kshiftlb $1, %k0, %k0 -; SKX-NEXT: kshiftrb $1, %k0, %k0 -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: korb %k1, %k0, %k0 +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k1 # sched: [1:1.00] +; SKX-NEXT: kmovd %esi, %k2 # sched: [1:1.00] +; SKX-NEXT: kshiftlw $7, %k2, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftrw $15, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: kshiftlw $6, %k2, %k2 # sched: [3:1.00] +; SKX-NEXT: kshiftrw $15, %k2, %k2 # sched: [3:1.00] +; SKX-NEXT: vpmovm2q %k1, %zmm0 +; SKX-NEXT: vpmovm2q %k2, %zmm1 +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7] sched: [8:0.50] +; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 # sched: [3:1.00] +; SKX-NEXT: vpmovq2m %zmm2, %k1 # sched: [1:1.00] +; SKX-NEXT: kshiftlb $1, %k1, %k1 # sched: [3:1.00] +; SKX-NEXT: kshiftrb $1, %k1, %k1 # sched: [3:1.00] +; SKX-NEXT: kshiftlb $7, %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: korb %k0, %k1, %k0 # sched: [1:1.00] ; SKX-NEXT: vpmovm2w %k0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = bitcast i8 %a to <8 x i1> %b1 = bitcast i16 %y to <16 x i1> %el1 = extractelement <16 x i1>%b1, i32 8 @@ -6012,151 +7450,159 @@ define <8 x i1> @vmov_test18(i8 %a, i16 %y) { ret <8 x i1>%d } define <32 x i16> @vmov_test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone { -; CHECK-LABEL: vmov_test21: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test21: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %ymm1, %k1 +; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vmov_test21: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 -; SKX-NEXT: vpmovb2m %ymm1, %k1 +; SKX: # BB#0: +; SKX-NEXT: vpsllw $7, %ymm1, %ymm1 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %ymm1, %k1 # sched: [1:1.00] ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define void @vmov_test22(<4 x i1> %a, <4 x i1>* %addr) { -; CHECK-LABEL: vmov_test22: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test22: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vmov_test22: -; SKX: ## BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] store <4 x i1> %a, <4 x i1>* %addr ret void } define void @vmov_test23(<2 x i1> %a, <2 x i1>* %addr) { -; CHECK-LABEL: vmov_test23: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: vmov_test23: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: vmov_test23: -; SKX: ## BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] store <2 x i1> %a, <2 x i1>* %addr ret void } define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) { -; CHECK-LABEL: store_v1i1: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: kxnorw %k0, %k0, %k1 # sched: [1:1.00] -; CHECK-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, (%rsi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_v1i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: kxnorw %k0, %k0, %k1 +; GENERIC-NEXT: kxorw %k1, %k0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rsi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_v1i1: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rsi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kxnorw %k0, %k0, %k1 # sched: [1:1.00] +; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, (%rsi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = xor <1 x i1> %c, store <1 x i1> %x, <1 x i1>* %ptr, align 4 ret void } define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) { -; CHECK-LABEL: store_v2i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_v2i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmq %xmm0, %xmm0, %k0 +; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_v2i1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 -; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 -; SKX-NEXT: knotw %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmq %xmm0, %xmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = xor <2 x i1> %c, store <2 x i1> %x, <2 x i1>* %ptr, align 4 ret void } define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) { -; CHECK-LABEL: store_v4i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_v4i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vptestmd %xmm0, %xmm0, %k0 +; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_v4i1: -; SKX: ## BB#0: -; SKX-NEXT: vpslld $31, %xmm0, %xmm0 -; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 -; SKX-NEXT: knotw %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpslld $31, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vptestmd %xmm0, %xmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = xor <4 x i1> %c, store <4 x i1> %x, <4 x i1>* %ptr, align 4 ret void } define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) { -; CHECK-LABEL: store_v8i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: knotb %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_v8i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k0 +; GENERIC-NEXT: knotb %k0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_v8i1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k0 -; SKX-NEXT: knotb %k0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: knotb %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = xor <8 x i1> %c, store <8 x i1> %x, <8 x i1>* %ptr, align 4 ret void } define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) { -; CHECK-LABEL: store_v16i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: knotw %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_v16i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k0 +; GENERIC-NEXT: knotw %k0, %k0 +; GENERIC-NEXT: kmovw %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_v16i1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k0 -; SKX-NEXT: knotw %k0, %k0 -; SKX-NEXT: kmovw %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: knotw %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %x = xor <16 x i1> %c, store <16 x i1> %x, <16 x i1>* %ptr, align 4 ret void @@ -6176,12 +7622,19 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) { @f1.v = internal unnamed_addr global i1 false, align 4 define void @f1(i32 %c) { -; CHECK-LABEL: f1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movzbl {{.*}}(%rip), %edi # sched: [5:0.50] -; CHECK-NEXT: xorl $1, %edi # sched: [1:0.25] -; CHECK-NEXT: movb %dil, {{.*}}(%rip) # sched: [1:1.00] -; CHECK-NEXT: jmp f2 # TAILCALL +; GENERIC-LABEL: f1: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: movzbl {{.*}}(%rip), %edi # sched: [5:0.50] +; GENERIC-NEXT: xorl $1, %edi # sched: [1:0.33] +; GENERIC-NEXT: movb %dil, {{.*}}(%rip) # sched: [5:1.00] +; GENERIC-NEXT: jmp f2 # TAILCALL +; +; SKX-LABEL: f1: +; SKX: # BB#0: # %entry +; SKX-NEXT: movzbl {{.*}}(%rip), %edi # sched: [5:0.50] +; SKX-NEXT: xorl $1, %edi # sched: [1:0.25] +; SKX-NEXT: movb %dil, {{.*}}(%rip) # sched: [1:1.00] +; SKX-NEXT: jmp f2 # TAILCALL entry: %.b1 = load i1, i1* @f1.v, align 4 %not..b1 = xor i1 %.b1, true @@ -6194,93 +7647,107 @@ entry: declare void @f2(i32) #1 define void @store_i16_i1(i16 %x, i1 *%y) { -; CHECK-LABEL: store_i16_i1: -; CHECK: # BB#0: -; CHECK-NEXT: andl $1, %edi # sched: [1:0.25] -; CHECK-NEXT: movb %dil, (%rsi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_i16_i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33] +; GENERIC-NEXT: movb %dil, (%rsi) # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: store_i16_i1: +; SKX: # BB#0: +; SKX-NEXT: andl $1, %edi # sched: [1:0.25] +; SKX-NEXT: movb %dil, (%rsi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %c = trunc i16 %x to i1 store i1 %c, i1* %y ret void } define void @store_i8_i1(i8 %x, i1 *%y) { -; CHECK-LABEL: store_i8_i1: -; CHECK: # BB#0: -; CHECK-NEXT: andl $1, %edi # sched: [1:0.25] -; CHECK-NEXT: movb %dil, (%rsi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_i8_i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: andl $1, %edi # sched: [1:0.33] +; GENERIC-NEXT: movb %dil, (%rsi) # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: store_i8_i1: +; SKX: # BB#0: +; SKX-NEXT: andl $1, %edi # sched: [1:0.25] +; SKX-NEXT: movb %dil, (%rsi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %c = trunc i8 %x to i1 store i1 %c, i1* %y ret void } define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) { -; CHECK-LABEL: test_build_vec_v32i1: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1497715861, %eax # imm = 0x59455495 -; CHECK-NEXT: # sched: [1:0.25] -; CHECK-NEXT: kmovd %eax, %k1 # sched: [1:1.00] -; CHECK-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_build_vec_v32i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: movl $1497715861, %eax # imm = 0x59455495 +; GENERIC-NEXT: # sched: [1:0.33] +; GENERIC-NEXT: kmovd %eax, %k1 +; GENERIC-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_build_vec_v32i1: -; SKX: ## BB#0: -; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495 -; SKX-NEXT: kmovd %eax, %k1 +; SKX: # BB#0: +; SKX-NEXT: movl $1497715861, %eax # imm = 0x59455495 +; SKX-NEXT: # sched: [1:0.25] +; SKX-NEXT: kmovd %eax, %k1 # sched: [1:1.00] ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %ret = select <32 x i1> , <32 x i16> %x, <32 x i16> zeroinitializer ret <32 x i16> %ret } define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { -; CHECK-LABEL: test_build_vec_v64i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [8:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_build_vec_v64i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_build_vec_v64i1: -; SKX: ## BB#0: -; SKX-NEXT: movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544 -; SKX-NEXT: kmovq %rax, %k1 -; SKX-NEXT: vmovdqu8 %zmm0, %zmm0 {%k1} {z} -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zmm0[2],zero,zero,zero,zmm0[6],zero,zmm0[8],zero,zmm0[10],zero,zmm0[12],zero,zero,zmm0[15],zero,zero,zmm0[18],zero,zmm0[20],zero,zmm0[22],zero,zmm0[24],zero,zero,zmm0[27],zero,zero,zmm0[30],zero,zmm0[32],zero,zmm0[34],zero,zero,zero,zmm0[38],zero,zmm0[40],zero,zero,zmm0[43,44],zero,zmm0[46],zero,zmm0[48],zero,zmm0[50],zero,zero,zero,zmm0[54],zero,zmm0[56],zero,zero,zmm0[59,60],zero,zmm0[62],zero sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %ret = select <64 x i1> , <64 x i8> %x, <64 x i8> zeroinitializer ret <64 x i8> %ret } define void @ktest_1(<8 x double> %in, double * %base) { -; CHECK-LABEL: ktest_1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovupd (%rdi), %zmm1 # sched: [8:0.50] -; CHECK-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00] -; CHECK-NEXT: ktestb %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: je .LBB410_2 # sched: [1:0.50] -; CHECK-NEXT: # BB#1: # %L1 -; CHECK-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; CHECK-NEXT: .LBB410_2: # %L2 -; CHECK-NEXT: vmovapd %zmm0, 8(%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ktest_1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovupd (%rdi), %zmm1 # sched: [4:0.50] +; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k1 +; GENERIC-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} +; GENERIC-NEXT: ktestb %k0, %k0 +; GENERIC-NEXT: je .LBB410_2 # sched: [1:1.00] +; GENERIC-NEXT: # BB#1: # %L1 +; GENERIC-NEXT: vmovapd %zmm0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; GENERIC-NEXT: .LBB410_2: # %L2 +; GENERIC-NEXT: vmovapd %zmm0, 8(%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: ktest_1: -; SKX: ## BB#0: -; SKX-NEXT: vmovupd (%rdi), %zmm1 -; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} -; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} -; SKX-NEXT: ktestb %k0, %k0 -; SKX-NEXT: je LBB41_2 -; SKX-NEXT: ## BB#1: ## %L1 -; SKX-NEXT: vmovapd %zmm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; SKX-NEXT: LBB41_2: ## %L2 -; SKX-NEXT: vmovapd %zmm0, 8(%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vmovupd (%rdi), %zmm1 # sched: [8:0.50] +; SKX-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00] +; SKX-NEXT: ktestb %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: je .LBB410_2 # sched: [1:0.50] +; SKX-NEXT: # BB#1: # %L1 +; SKX-NEXT: vmovapd %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] +; SKX-NEXT: .LBB410_2: # %L2 +; SKX-NEXT: vmovapd %zmm0, 8(%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %addr1 = getelementptr double, double * %base, i64 0 %addr2 = getelementptr double, double * %base, i64 1 @@ -6310,56 +7777,57 @@ End: define void @ktest_2(<32 x float> %in, float * %base) { ; -; CHECK-LABEL: ktest_2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovups (%rdi), %zmm2 # sched: [8:0.50] -; CHECK-NEXT: vmovups 64(%rdi), %zmm3 # sched: [8:0.50] -; CHECK-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00] -; CHECK-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [8:0.50] -; CHECK-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: kunpckwd %k1, %k2, %k0 # sched: [3:1.00] -; CHECK-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00] -; CHECK-NEXT: kunpckwd %k1, %k2, %k1 # sched: [3:1.00] -; CHECK-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: ktestd %k0, %k0 # sched: [3:1.00] -; CHECK-NEXT: je .LBB411_2 # sched: [1:0.50] -; CHECK-NEXT: # BB#1: # %L1 -; CHECK-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm1, 64(%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] -; CHECK-NEXT: .LBB411_2: # %L2 -; CHECK-NEXT: vmovaps %zmm0, 4(%rdi) # sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm1, 68(%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: ktest_2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovups (%rdi), %zmm2 # sched: [4:0.50] +; GENERIC-NEXT: vmovups 64(%rdi), %zmm3 # sched: [4:0.50] +; GENERIC-NEXT: vcmpltps %zmm0, %zmm2, %k1 +; GENERIC-NEXT: vcmpltps %zmm1, %zmm3, %k2 +; GENERIC-NEXT: kunpckwd %k1, %k2, %k0 +; GENERIC-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [4:0.50] +; GENERIC-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [4:0.50] +; GENERIC-NEXT: vcmpltps %zmm3, %zmm0, %k1 +; GENERIC-NEXT: vcmpltps %zmm2, %zmm1, %k2 +; GENERIC-NEXT: kunpckwd %k1, %k2, %k1 +; GENERIC-NEXT: kord %k1, %k0, %k0 +; GENERIC-NEXT: ktestd %k0, %k0 +; GENERIC-NEXT: je .LBB411_2 # sched: [1:1.00] +; GENERIC-NEXT: # BB#1: # %L1 +; GENERIC-NEXT: vmovaps %zmm0, (%rdi) +; GENERIC-NEXT: vmovaps %zmm1, 64(%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; GENERIC-NEXT: .LBB411_2: # %L2 +; GENERIC-NEXT: vmovaps %zmm0, 4(%rdi) +; GENERIC-NEXT: vmovaps %zmm1, 68(%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: ktest_2: -; SKX: ## BB#0: -; SKX-NEXT: vmovups (%rdi), %zmm2 -; SKX-NEXT: vmovups 64(%rdi), %zmm3 -; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1 -; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2 -; SKX-NEXT: kunpckwd %k1, %k2, %k0 -; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} -; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} -; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1 -; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 -; SKX-NEXT: kunpckwd %k1, %k2, %k1 -; SKX-NEXT: kord %k1, %k0, %k0 -; SKX-NEXT: ktestd %k0, %k0 -; SKX-NEXT: je LBB42_2 -; SKX-NEXT: ## BB#1: ## %L1 -; SKX-NEXT: vmovaps %zmm0, (%rdi) -; SKX-NEXT: vmovaps %zmm1, 64(%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq -; SKX-NEXT: LBB42_2: ## %L2 -; SKX-NEXT: vmovaps %zmm0, 4(%rdi) -; SKX-NEXT: vmovaps %zmm1, 68(%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm2 # sched: [8:0.50] +; SKX-NEXT: vmovups 64(%rdi), %zmm3 # sched: [8:0.50] +; SKX-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00] +; SKX-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [8:0.50] +; SKX-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: kunpckwd %k1, %k2, %k0 # sched: [3:1.00] +; SKX-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00] +; SKX-NEXT: kunpckwd %k1, %k2, %k1 # sched: [3:1.00] +; SKX-NEXT: kord %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: ktestd %k0, %k0 # sched: [3:1.00] +; SKX-NEXT: je .LBB411_2 # sched: [1:0.50] +; SKX-NEXT: # BB#1: # %L1 +; SKX-NEXT: vmovaps %zmm0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm1, 64(%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] +; SKX-NEXT: .LBB411_2: # %L2 +; SKX-NEXT: vmovaps %zmm0, 4(%rdi) # sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm1, 68(%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %addr1 = getelementptr float, float * %base, i64 0 %addr2 = getelementptr float, float * %base, i64 1 @@ -6388,187 +7856,198 @@ End: } define <8 x i64> @load_8i1(<8 x i1>* %a) { -; CHECK-LABEL: load_8i1: -; CHECK: # BB#0: -; CHECK-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: vpmovm2q %k0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: load_8i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovb (%rdi), %k0 +; GENERIC-NEXT: vpmovm2q %k0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: load_8i1: -; SKX: ## BB#0: -; SKX-NEXT: kmovb (%rdi), %k0 +; SKX: # BB#0: +; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2q %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %b = load <8 x i1>, <8 x i1>* %a %c = sext <8 x i1> %b to <8 x i64> ret <8 x i64> %c } define <16 x i32> @load_16i1(<16 x i1>* %a) { -; CHECK-LABEL: load_16i1: -; CHECK: # BB#0: -; CHECK-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: vpmovm2d %k0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: load_16i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovw (%rdi), %k0 +; GENERIC-NEXT: vpmovm2d %k0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: load_16i1: -; SKX: ## BB#0: -; SKX-NEXT: kmovw (%rdi), %k0 +; SKX: # BB#0: +; SKX-NEXT: kmovw (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2d %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %b = load <16 x i1>, <16 x i1>* %a %c = sext <16 x i1> %b to <16 x i32> ret <16 x i32> %c } define <2 x i16> @load_2i1(<2 x i1>* %a) { -; CHECK-LABEL: load_2i1: -; CHECK: # BB#0: -; CHECK-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: vpmovm2q %k0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: load_2i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovb (%rdi), %k0 +; GENERIC-NEXT: vpmovm2q %k0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: load_2i1: -; SKX: ## BB#0: -; SKX-NEXT: kmovb (%rdi), %k0 +; SKX: # BB#0: +; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2q %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %b = load <2 x i1>, <2 x i1>* %a %c = sext <2 x i1> %b to <2 x i16> ret <2 x i16> %c } define <4 x i16> @load_4i1(<4 x i1>* %a) { -; CHECK-LABEL: load_4i1: -; CHECK: # BB#0: -; CHECK-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: vpmovm2d %k0, %xmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: load_4i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovb (%rdi), %k0 +; GENERIC-NEXT: vpmovm2d %k0, %xmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: load_4i1: -; SKX: ## BB#0: -; SKX-NEXT: kmovb (%rdi), %k0 +; SKX: # BB#0: +; SKX-NEXT: kmovb (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2d %k0, %xmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %b = load <4 x i1>, <4 x i1>* %a %c = sext <4 x i1> %b to <4 x i16> ret <4 x i16> %c } define <32 x i16> @load_32i1(<32 x i1>* %a) { -; CHECK-LABEL: load_32i1: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: vpmovm2w %k0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: load_32i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd (%rdi), %k0 +; GENERIC-NEXT: vpmovm2w %k0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: load_32i1: -; SKX: ## BB#0: -; SKX-NEXT: kmovd (%rdi), %k0 +; SKX: # BB#0: +; SKX-NEXT: kmovd (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2w %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %b = load <32 x i1>, <32 x i1>* %a %c = sext <32 x i1> %b to <32 x i16> ret <32 x i16> %c } define <64 x i8> @load_64i1(<64 x i1>* %a) { -; CHECK-LABEL: load_64i1: -; CHECK: # BB#0: -; CHECK-NEXT: kmovq (%rdi), %k0 # sched: [7:1.00] -; CHECK-NEXT: vpmovm2b %k0, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: load_64i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovq (%rdi), %k0 +; GENERIC-NEXT: vpmovm2b %k0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: load_64i1: -; SKX: ## BB#0: -; SKX-NEXT: kmovq (%rdi), %k0 +; SKX: # BB#0: +; SKX-NEXT: kmovq (%rdi), %k0 # sched: [7:1.00] ; SKX-NEXT: vpmovm2b %k0, %zmm0 -; SKX-NEXT: retq +; SKX-NEXT: retq # sched: [7:1.00] %b = load <64 x i1>, <64 x i1>* %a %c = sext <64 x i1> %b to <64 x i8> ret <64 x i8> %c } define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) { -; CHECK-LABEL: store_8i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_8i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_8i1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] store <8 x i1> %v, <8 x i1>* %a ret void } define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) { -; CHECK-LABEL: store_8i1_1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_8i1_1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovw2m %xmm0, %k0 +; GENERIC-NEXT: kmovb %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_8i1_1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 -; SKX-NEXT: vpmovw2m %xmm0, %k0 -; SKX-NEXT: kmovb %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $15, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %xmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovb %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %v1 = trunc <8 x i16> %v to <8 x i1> store <8 x i1> %v1, <8 x i1>* %a ret void } define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) { -; CHECK-LABEL: store_16i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_16i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %xmm0, %k0 +; GENERIC-NEXT: kmovw %k0, (%rdi) +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_16i1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 -; SKX-NEXT: vpmovb2m %xmm0, %k0 -; SKX-NEXT: kmovw %k0, (%rdi) -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $7, %xmm0, %xmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %xmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovw %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] store <16 x i1> %v, <16 x i1>* %a ret void } define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) { -; CHECK-LABEL: store_32i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %ymm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_32i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: vpmovb2m %ymm0, %k0 +; GENERIC-NEXT: kmovd %k0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_32i1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 -; SKX-NEXT: vpmovb2m %ymm0, %k0 -; SKX-NEXT: kmovd %k0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $7, %ymm0, %ymm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %ymm0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] store <32 x i1> %v, <32 x i1>* %a ret void } define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) { -; CHECK-LABEL: store_32i1_1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $15, %zmm0, %zmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovw2m %zmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_32i1_1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $15, %zmm0, %zmm0 +; GENERIC-NEXT: vpmovw2m %zmm0, %k0 +; GENERIC-NEXT: kmovd %k0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_32i1_1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $15, %zmm0, %zmm0 -; SKX-NEXT: vpmovw2m %zmm0, %k0 -; SKX-NEXT: kmovd %k0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $15, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovw2m %zmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %v1 = trunc <32 x i16> %v to <32 x i1> store <32 x i1> %v1, <32 x i1>* %a ret void @@ -6577,41 +8056,43 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) { define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; -; CHECK-LABEL: store_64i1: -; CHECK: # BB#0: -; CHECK-NEXT: vpsllw $7, %zmm0, %zmm0 # sched: [1:0.50] -; CHECK-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovq %k0, (%rdi) # sched: [1:1.00] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: store_64i1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpsllw $7, %zmm0, %zmm0 +; GENERIC-NEXT: vpmovb2m %zmm0, %k0 +; GENERIC-NEXT: kmovq %k0, (%rdi) +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: store_64i1: -; SKX: ## BB#0: -; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 -; SKX-NEXT: vpmovb2m %zmm0, %k0 -; SKX-NEXT: kmovq %k0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpsllw $7, %zmm0, %zmm0 # sched: [1:0.50] +; SKX-NEXT: vpmovb2m %zmm0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovq %k0, (%rdi) # sched: [1:1.00] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] store <64 x i1> %v, <64 x i1>* %a ret void } define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { -; CHECK-LABEL: test_bitcast_v8i1_zext: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovb %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: addl %eax, %eax # sched: [1:0.25] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_bitcast_v8i1_zext: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; GENERIC-NEXT: kmovb %k0, %eax +; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33] +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_bitcast_v8i1_zext: -; SKX: ## BB#0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 -; SKX-NEXT: kmovb %k0, %eax -; SKX-NEXT: addl %eax, %eax -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovb %k0, %eax # sched: [3:1.00] +; SKX-NEXT: addl %eax, %eax # sched: [1:0.25] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> %mask1 = bitcast <8 x i1> %mask to i8 @@ -6621,14 +8102,23 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) { } define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { -; CHECK-LABEL: test_bitcast_v16i1_zext: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: kmovw %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: addl %eax, %eax # sched: [1:0.25] -; CHECK-NEXT: vzeroupper # sched: [4:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_bitcast_v16i1_zext: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 +; GENERIC-NEXT: kmovw %k0, %eax +; GENERIC-NEXT: addl %eax, %eax # sched: [1:0.33] +; GENERIC-NEXT: vzeroupper +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_bitcast_v16i1_zext: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: kmovw %k0, %eax # sched: [3:1.00] +; SKX-NEXT: addl %eax, %eax # sched: [1:0.25] +; SKX-NEXT: vzeroupper # sched: [4:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %v1 = icmp eq <16 x i32> %a, zeroinitializer %mask1 = bitcast <16 x i1> %v1 to i16 %val = zext i16 %mask1 to i32 @@ -6637,22 +8127,23 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) { } define i16 @test_v16i1_add(i16 %x, i16 %y) { -; CHECK-LABEL: test_v16i1_add: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %esi, %k1 # sched: [1:1.00] -; CHECK-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_v16i1_add: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: kmovd %esi, %k1 +; GENERIC-NEXT: kxorw %k1, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_v16i1_add: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] +; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AX %AX %EAX +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> %m2 = add <16 x i1> %m0, %m1 @@ -6661,22 +8152,23 @@ define i16 @test_v16i1_add(i16 %x, i16 %y) { } define i16 @test_v16i1_sub(i16 %x, i16 %y) { -; CHECK-LABEL: test_v16i1_sub: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %esi, %k1 # sched: [1:1.00] -; CHECK-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_v16i1_sub: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: kmovd %esi, %k1 +; GENERIC-NEXT: kxorw %k1, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_v16i1_sub: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] +; SKX-NEXT: kxorw %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AX %AX %EAX +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> %m2 = sub <16 x i1> %m0, %m1 @@ -6685,22 +8177,23 @@ define i16 @test_v16i1_sub(i16 %x, i16 %y) { } define i16 @test_v16i1_mul(i16 %x, i16 %y) { -; CHECK-LABEL: test_v16i1_mul: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %esi, %k1 # sched: [1:1.00] -; CHECK-NEXT: kandw %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AX %AX %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_v16i1_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: kmovd %esi, %k1 +; GENERIC-NEXT: kandw %k1, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AX %AX %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_v16i1_mul: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kandw %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AX %AX %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] +; SKX-NEXT: kandw %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AX %AX %EAX +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i16 %x to <16 x i1> %m1 = bitcast i16 %y to <16 x i1> %m2 = mul <16 x i1> %m0, %m1 @@ -6709,22 +8202,23 @@ define i16 @test_v16i1_mul(i16 %x, i16 %y) { } define i8 @test_v8i1_add(i8 %x, i8 %y) { -; CHECK-LABEL: test_v8i1_add: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %esi, %k1 # sched: [1:1.00] -; CHECK-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AL %AL %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_v8i1_add: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: kmovd %esi, %k1 +; GENERIC-NEXT: kxorb %k1, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AL %AL %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_v8i1_add: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kxorb %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] +; SKX-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AL %AL %EAX +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = add <8 x i1> %m0, %m1 @@ -6733,22 +8227,23 @@ define i8 @test_v8i1_add(i8 %x, i8 %y) { } define i8 @test_v8i1_sub(i8 %x, i8 %y) { -; CHECK-LABEL: test_v8i1_sub: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %esi, %k1 # sched: [1:1.00] -; CHECK-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AL %AL %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_v8i1_sub: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: kmovd %esi, %k1 +; GENERIC-NEXT: kxorb %k1, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AL %AL %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_v8i1_sub: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kxorb %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] +; SKX-NEXT: kxorb %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AL %AL %EAX +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = sub <8 x i1> %m0, %m1 @@ -6757,22 +8252,23 @@ define i8 @test_v8i1_sub(i8 %x, i8 %y) { } define i8 @test_v8i1_mul(i8 %x, i8 %y) { -; CHECK-LABEL: test_v8i1_mul: -; CHECK: # BB#0: -; CHECK-NEXT: kmovd %edi, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %esi, %k1 # sched: [1:1.00] -; CHECK-NEXT: kandb %k1, %k0, %k0 # sched: [1:1.00] -; CHECK-NEXT: kmovd %k0, %eax # sched: [3:1.00] -; CHECK-NEXT: # kill: %AL %AL %EAX -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_v8i1_mul: +; GENERIC: # BB#0: +; GENERIC-NEXT: kmovd %edi, %k0 +; GENERIC-NEXT: kmovd %esi, %k1 +; GENERIC-NEXT: kandb %k1, %k0, %k0 +; GENERIC-NEXT: kmovd %k0, %eax +; GENERIC-NEXT: # kill: %AL %AL %EAX +; GENERIC-NEXT: retq # sched: [1:1.00] +; ; SKX-LABEL: test_v8i1_mul: -; SKX: ## BB#0: -; SKX-NEXT: kmovd %edi, %k0 -; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kandb %k1, %k0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: ## kill: %AL %AL %EAX -; SKX-NEXT: retq +; SKX: # BB#0: +; SKX-NEXT: kmovd %edi, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %esi, %k1 # sched: [1:1.00] +; SKX-NEXT: kandb %k1, %k0, %k0 # sched: [1:1.00] +; SKX-NEXT: kmovd %k0, %eax # sched: [3:1.00] +; SKX-NEXT: # kill: %AL %AL %EAX +; SKX-NEXT: retq # sched: [7:1.00] %m0 = bitcast i8 %x to <8 x i1> %m1 = bitcast i8 %y to <8 x i1> %m2 = mul <8 x i1> %m0, %m1 @@ -6781,52 +8277,80 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) { } define <16 x i32> @_inreg16xi32(i32 %a) { -; CHECK-LABEL: _inreg16xi32: -; CHECK: # BB#0: -; CHECK-NEXT: vpbroadcastd %edi, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _inreg16xi32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpbroadcastd %edi, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _inreg16xi32: +; SKX: # BB#0: +; SKX-NEXT: vpbroadcastd %edi, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = insertelement <16 x i32> undef, i32 %a, i32 0 %c = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32> %c } define <8 x i64> @_inreg8xi64(i64 %a) { -; CHECK-LABEL: _inreg8xi64: -; CHECK: # BB#0: -; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _inreg8xi64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpbroadcastq %rdi, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _inreg8xi64: +; SKX: # BB#0: +; SKX-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = insertelement <8 x i64> undef, i64 %a, i32 0 %c = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer ret <8 x i64> %c } define <16 x float> @_ss16xfloat_v4(<4 x float> %a) { -; CHECK-LABEL: _ss16xfloat_v4: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _ss16xfloat_v4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _ss16xfloat_v4: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %b } define <16 x float> @_inreg16xfloat(float %a) { -; CHECK-LABEL: _inreg16xfloat: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _inreg16xfloat: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _inreg16xfloat: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %c } define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %mask1) { -; CHECK-LABEL: _ss16xfloat_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _ss16xfloat_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _ss16xfloat_mask: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer @@ -6835,12 +8359,19 @@ define <16 x float> @_ss16xfloat_mask(float %a, <16 x float> %i, <16 x i32> %m } define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) { -; CHECK-LABEL: _ss16xfloat_maskz: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _ss16xfloat_maskz: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _ss16xfloat_maskz: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <16 x i32> %mask1, zeroinitializer %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer @@ -6849,10 +8380,15 @@ define <16 x float> @_ss16xfloat_maskz(float %a, <16 x i32> %mask1) { } define <16 x float> @_ss16xfloat_load(float* %a.ptr) { -; CHECK-LABEL: _ss16xfloat_load: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _ss16xfloat_load: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _ss16xfloat_load: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a = load float, float* %a.ptr %b = insertelement <16 x float> undef, float %a, i32 0 %c = shufflevector <16 x float> %b, <16 x float> undef, <16 x i32> zeroinitializer @@ -6860,12 +8396,19 @@ define <16 x float> @_ss16xfloat_load(float* %a.ptr) { } define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 x i32> %mask1) { -; CHECK-LABEL: _ss16xfloat_mask_load: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _ss16xfloat_mask_load: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _ss16xfloat_mask_load: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a = load float, float* %a.ptr %mask = icmp ne <16 x i32> %mask1, zeroinitializer %b = insertelement <16 x float> undef, float %a, i32 0 @@ -6875,12 +8418,19 @@ define <16 x float> @_ss16xfloat_mask_load(float* %a.ptr, <16 x float> %i, <16 } define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) { -; CHECK-LABEL: _ss16xfloat_maskz_load: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _ss16xfloat_maskz_load: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _ss16xfloat_maskz_load: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vbroadcastss (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a = load float, float* %a.ptr %mask = icmp ne <16 x i32> %mask1, zeroinitializer %b = insertelement <16 x float> undef, float %a, i32 0 @@ -6890,23 +8440,36 @@ define <16 x float> @_ss16xfloat_maskz_load(float* %a.ptr, <16 x i32> %mask1) } define <8 x double> @_inreg8xdouble(double %a) { -; CHECK-LABEL: _inreg8xdouble: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _inreg8xdouble: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _inreg8xdouble: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer ret <8 x double> %c } define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %mask1) { -; CHECK-LABEL: _sd8xdouble_mask: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _sd8xdouble_mask: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _sd8xdouble_mask: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer @@ -6915,12 +8478,19 @@ define <8 x double> @_sd8xdouble_mask(double %a, <8 x double> %i, <8 x i32> %m } define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) { -; CHECK-LABEL: _sd8xdouble_maskz: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _sd8xdouble_maskz: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _sd8xdouble_maskz: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer @@ -6929,10 +8499,15 @@ define <8 x double> @_sd8xdouble_maskz(double %a, <8 x i32> %mask1) { } define <8 x double> @_sd8xdouble_load(double* %a.ptr) { -; CHECK-LABEL: _sd8xdouble_load: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _sd8xdouble_load: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _sd8xdouble_load: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a = load double, double* %a.ptr %b = insertelement <8 x double> undef, double %a, i32 0 %c = shufflevector <8 x double> %b, <8 x double> undef, <8 x i32> zeroinitializer @@ -6940,12 +8515,19 @@ define <8 x double> @_sd8xdouble_load(double* %a.ptr) { } define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 x i32> %mask1) { -; CHECK-LABEL: _sd8xdouble_mask_load: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _sd8xdouble_mask_load: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _sd8xdouble_mask_load: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a = load double, double* %a.ptr %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 @@ -6955,12 +8537,19 @@ define <8 x double> @_sd8xdouble_mask_load(double* %a.ptr, <8 x double> %i, <8 } define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) { -; CHECK-LABEL: _sd8xdouble_maskz_load: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _sd8xdouble_maskz_load: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _sd8xdouble_maskz_load: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpneqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vbroadcastsd (%rdi), %zmm0 {%k1} {z} # sched: [8:0.50] +; SKX-NEXT: retq # sched: [7:1.00] %a = load double, double* %a.ptr %mask = icmp ne <8 x i32> %mask1, zeroinitializer %b = insertelement <8 x double> undef, double %a, i32 0 @@ -6970,32 +8559,51 @@ define <8 x double> @_sd8xdouble_maskz_load(double* %a.ptr, <8 x i32> %mask1) } define <16 x i32> @_xmm16xi32(<16 x i32> %a) { -; CHECK-LABEL: _xmm16xi32: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _xmm16xi32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _xmm16xi32: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32> %b } define <16 x float> @_xmm16xfloat(<16 x float> %a) { -; CHECK-LABEL: _xmm16xfloat: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _xmm16xfloat: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _xmm16xfloat: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer ret <16 x float> %b } define <16 x i32> @test_vbroadcast() { -; CHECK-LABEL: test_vbroadcast: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.33] -; CHECK-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00] -; CHECK-NEXT: vpmovm2d %k0, %zmm0 -; CHECK-NEXT: knotw %k0, %k1 # sched: [1:1.00] -; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_vbroadcast: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: vcmpunordps %zmm0, %zmm0, %k0 +; GENERIC-NEXT: vpmovm2d %k0, %zmm0 +; GENERIC-NEXT: knotw %k0, %k1 +; GENERIC-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_vbroadcast: +; SKX: # BB#0: # %entry +; SKX-NEXT: vxorps %xmm0, %xmm0, %xmm0 # sched: [1:0.33] +; SKX-NEXT: vcmpunordps %zmm0, %zmm0, %k0 # sched: [3:1.00] +; SKX-NEXT: vpmovm2d %k0, %zmm0 +; SKX-NEXT: knotw %k0, %k1 # sched: [1:1.00] +; SKX-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = sext <16 x i1> zeroinitializer to <16 x i32> %1 = fcmp uno <16 x float> undef, zeroinitializer @@ -7007,10 +8615,15 @@ entry: ; We implement the set1 intrinsics with vector initializers. Verify that the ; IR generated will produce broadcasts at the end. define <8 x double> @test_set1_pd(double %d) #2 { -; CHECK-LABEL: test_set1_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_set1_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_set1_pd: +; SKX: # BB#0: # %entry +; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %vecinit.i = insertelement <8 x double> undef, double %d, i32 0 %vecinit1.i = insertelement <8 x double> %vecinit.i, double %d, i32 1 @@ -7024,10 +8637,15 @@ entry: } define <8 x i64> @test_set1_epi64(i64 %d) #2 { -; CHECK-LABEL: test_set1_epi64: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_set1_epi64: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpbroadcastq %rdi, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_set1_epi64: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpbroadcastq %rdi, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %vecinit.i = insertelement <8 x i64> undef, i64 %d, i32 0 %vecinit1.i = insertelement <8 x i64> %vecinit.i, i64 %d, i32 1 @@ -7041,10 +8659,15 @@ entry: } define <16 x float> @test_set1_ps(float %f) #2 { -; CHECK-LABEL: test_set1_ps: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_set1_ps: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_set1_ps: +; SKX: # BB#0: # %entry +; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %vecinit.i = insertelement <16 x float> undef, float %f, i32 0 %vecinit1.i = insertelement <16 x float> %vecinit.i, float %f, i32 1 @@ -7066,10 +8689,15 @@ entry: } define <16 x i32> @test_set1_epi32(i32 %f) #2 { -; CHECK-LABEL: test_set1_epi32: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vpbroadcastd %edi, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_set1_epi32: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vpbroadcastd %edi, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_set1_epi32: +; SKX: # BB#0: # %entry +; SKX-NEXT: vpbroadcastd %edi, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %vecinit.i = insertelement <16 x i32> undef, i32 %f, i32 0 %vecinit1.i = insertelement <16 x i32> %vecinit.i, i32 %f, i32 1 @@ -7093,10 +8721,15 @@ entry: ; We implement the scalar broadcast intrinsics with vector initializers. ; Verify that the IR generated will produce the broadcast at the end. define <8 x double> @test_mm512_broadcastsd_pd(<2 x double> %a) { -; CHECK-LABEL: test_mm512_broadcastsd_pd: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: test_mm512_broadcastsd_pd: +; GENERIC: # BB#0: # %entry +; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_mm512_broadcastsd_pd: +; SKX: # BB#0: # %entry +; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] entry: %0 = extractelement <2 x double> %a, i32 0 %vecinit.i = insertelement <8 x double> undef, double %0, i32 0 @@ -7111,73 +8744,115 @@ entry: } define <16 x float> @suff_test1(<8 x float>%a) { -; CHECK-LABEL: suff_test1: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: suff_test1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: suff_test1: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> zeroinitializer ret <16 x float>%res } define <8 x double> @suff_test2(<4 x double>%a) { -; CHECK-LABEL: suff_test2: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: suff_test2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: suff_test2: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> zeroinitializer ret <8 x double>%res } define <64 x i8> @_invec32xi8(<32 x i8>%a) { -; CHECK-LABEL: _invec32xi8: -; CHECK: # BB#0: -; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _invec32xi8: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpbroadcastb %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _invec32xi8: +; SKX: # BB#0: +; SKX-NEXT: vpbroadcastb %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i8> %a, <32 x i8> undef, <64 x i32> zeroinitializer ret <64 x i8>%res } define <32 x i16> @_invec16xi16(<16 x i16>%a) { -; CHECK-LABEL: _invec16xi16: -; CHECK: # BB#0: -; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _invec16xi16: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpbroadcastw %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _invec16xi16: +; SKX: # BB#0: +; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i16> %a, <16 x i16> undef, <32 x i32> zeroinitializer ret <32 x i16>%res } define <16 x i32> @_invec8xi32(<8 x i32>%a) { -; CHECK-LABEL: _invec8xi32: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _invec8xi32: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastss %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _invec8xi32: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastss %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32> zeroinitializer ret <16 x i32>%res } define <8 x i64> @_invec4xi64(<4 x i64>%a) { -; CHECK-LABEL: _invec4xi64: -; CHECK: # BB#0: -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: _invec4xi64: +; GENERIC: # BB#0: +; GENERIC-NEXT: vbroadcastsd %xmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: _invec4xi64: +; SKX: # BB#0: +; SKX-NEXT: vbroadcastsd %xmm0, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %a, <4 x i64> undef, <8 x i32> zeroinitializer ret <8 x i64>%res } declare void @func_f32(float) define <16 x float> @broadcast_ss_spill(float %x) { -; CHECK-LABEL: broadcast_ss_spill: -; CHECK: # BB#0: -; CHECK-NEXT: subq $24, %rsp # sched: [1:0.25] -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00] -; CHECK-NEXT: # sched: [1:1.00] -; CHECK-NEXT: callq func_f32 -; CHECK-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50] -; CHECK-NEXT: # sched: [8:0.50] -; CHECK-NEXT: addq $24, %rsp # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: broadcast_ss_spill: +; GENERIC: # BB#0: +; GENERIC-NEXT: subq $24, %rsp # sched: [1:0.33] +; GENERIC-NEXT: .cfi_def_cfa_offset 32 +; GENERIC-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [5:1.00] +; GENERIC-NEXT: # sched: [5:1.00] +; GENERIC-NEXT: callq func_f32 +; GENERIC-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload +; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: broadcast_ss_spill: +; SKX: # BB#0: +; SKX-NEXT: subq $24, %rsp # sched: [1:0.25] +; SKX-NEXT: .cfi_def_cfa_offset 32 +; SKX-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00] +; SKX-NEXT: # sched: [1:1.00] +; SKX-NEXT: callq func_f32 +; SKX-NEXT: vbroadcastss (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50] +; SKX-NEXT: # sched: [8:0.50] +; SKX-NEXT: addq $24, %rsp # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %a = fadd float %x, %x call void @func_f32(float %a) %b = insertelement <16 x float> undef, float %a, i32 0 @@ -7187,18 +8862,30 @@ define <16 x float> @broadcast_ss_spill(float %x) { declare void @func_f64(double) define <8 x double> @broadcast_sd_spill(double %x) { -; CHECK-LABEL: broadcast_sd_spill: -; CHECK: # BB#0: -; CHECK-NEXT: subq $24, %rsp # sched: [1:0.25] -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.33] -; CHECK-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00] -; CHECK-NEXT: # sched: [1:1.00] -; CHECK-NEXT: callq func_f64 -; CHECK-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50] -; CHECK-NEXT: # sched: [8:0.50] -; CHECK-NEXT: addq $24, %rsp # sched: [1:0.25] -; CHECK-NEXT: ret{{[l|q]}} # sched: [7:1.00] +; GENERIC-LABEL: broadcast_sd_spill: +; GENERIC: # BB#0: +; GENERIC-NEXT: subq $24, %rsp # sched: [1:0.33] +; GENERIC-NEXT: .cfi_def_cfa_offset 32 +; GENERIC-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [3:1.00] +; GENERIC-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [5:1.00] +; GENERIC-NEXT: # sched: [5:1.00] +; GENERIC-NEXT: callq func_f64 +; GENERIC-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload +; GENERIC-NEXT: addq $24, %rsp # sched: [1:0.33] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: broadcast_sd_spill: +; SKX: # BB#0: +; SKX-NEXT: subq $24, %rsp # sched: [1:0.25] +; SKX-NEXT: .cfi_def_cfa_offset 32 +; SKX-NEXT: vaddsd %xmm0, %xmm0, %xmm0 # sched: [4:0.33] +; SKX-NEXT: vmovapd %xmm0, (%rsp) # 16-byte Spill sched: [1:1.00] +; SKX-NEXT: # sched: [1:1.00] +; SKX-NEXT: callq func_f64 +; SKX-NEXT: vbroadcastsd (%rsp), %zmm0 # 16-byte Folded Reload sched: [8:0.50] +; SKX-NEXT: # sched: [8:0.50] +; SKX-NEXT: addq $24, %rsp # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %a = fadd double %x, %x call void @func_f64(double %a) %b = insertelement <8 x double> undef, double %a, i32 0 diff --git a/test/CodeGen/X86/avx512-shuffle-schedule.ll b/test/CodeGen/X86/avx512-shuffle-schedule.ll index 9aeb47bdc82..c59fb5b97bc 100755 --- a/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -1,25 +1,42 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=x86-64 -mattr=+avx512f,+avx512dq,+avx512bw,+avx512vl | FileCheck %s --check-prefix=GENERIC +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skx | FileCheck %s --check-prefix=SKX + ; This test is an assembly of avx512 shuffling instructions to check their scheduling define <16 x i16> @test_16xi16_perm_mask0(<16 x i16> %vec) { -; CHECK-LABEL: test_16xi16_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] -; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; GENERIC-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; SKX-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -27,27 +44,44 @@ define <16 x i16> @test_masked_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %ve } define <16 x i16> @test_masked_z_16xi16_perm_mask0(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -55,27 +89,44 @@ define <16 x i16> @test_masked_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %ve } define <16 x i16> @test_masked_z_16xi16_perm_mask1(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -83,36 +134,59 @@ define <16 x i16> @test_masked_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %ve } define <16 x i16> @test_masked_z_16xi16_perm_mask2(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_16xi16_perm_mask3(<16 x i16> %vec) { -; CHECK-LABEL: test_16xi16_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] -; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; GENERIC-NEXT: vpermw %ymm0, %ymm1, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; SKX-NEXT: vpermw %ymm0, %ymm1, %ymm0 # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %ymm0, %ymm3, %ymm1 {%k1} # sched: [6:2.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -120,36 +194,58 @@ define <16 x i16> @test_masked_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %ve } define <16 x i16> @test_masked_z_16xi16_perm_mask3(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_16xi16_perm_mem_mask0(<16 x i16>* %vp) { -; CHECK-LABEL: test_16xi16_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm0 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; SKX-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -158,13 +254,21 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> } define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -173,13 +277,21 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask0(<16 x i16>* %vp, <16 x i1 } define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -188,13 +300,21 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> } define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -203,13 +323,21 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask1(<16 x i16>* %vp, <16 x i1 } define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -218,13 +346,21 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> } define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -233,23 +369,37 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask2(<16 x i16>* %vp, <16 x i1 } define <16 x i16> @test_16xi16_perm_mem_mask3(<16 x i16>* %vp) { -; CHECK-LABEL: test_16xi16_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; CHECK-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; GENERIC-NEXT: vpermw (%rdi), %ymm0, %ymm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm0 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; SKX-NEXT: vpermw (%rdi), %ymm0, %ymm0 # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %ymm2, %ymm0 {%k1} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -258,13 +408,21 @@ define <16 x i16> @test_masked_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> } define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -273,23 +431,38 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1 } define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { -; CHECK-LABEL: test_32xi16_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50] -; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50] +; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50] +; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -297,27 +470,44 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve } define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -325,27 +515,44 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve } define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -353,36 +560,59 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve } define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { -; CHECK-LABEL: test_32xi16_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50] -; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50] +; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50] +; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [6:2.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -390,36 +620,58 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve } define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [6:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) { -; CHECK-LABEL: test_32xi16_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50] -; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50] +; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50] +; SKX-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -428,13 +680,21 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> } define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -443,13 +703,21 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1 } define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -458,13 +726,21 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> } define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -473,13 +749,21 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1 } define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -488,13 +772,21 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> } define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -503,23 +795,37 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1 } define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) { -; CHECK-LABEL: test_32xi16_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50] -; CHECK-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50] +; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50] +; SKX-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -528,13 +834,21 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> } define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [13:2.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -543,23 +857,38 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i1 } define <8 x i32> @test_8xi32_perm_mask0(<8 x i32> %vec) { -; CHECK-LABEL: test_8xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50] +; GENERIC-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [4,2,0,6,7,2,3,6] sched: [7:0.50] +; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [4,2,0,6,7,2,3,6] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 @@ -567,27 +896,44 @@ define <8 x i32> @test_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, } define <8 x i32> @test_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,2,0,6,7,2,3,6] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,1,2,6,0,0,3] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 @@ -595,27 +941,44 @@ define <8 x i32> @test_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, } define <8 x i32> @test_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,1,2,6,0,0,3] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,6,5,5,1,7,3,4] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 @@ -623,36 +986,59 @@ define <8 x i32> @test_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, } define <8 x i32> @test_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,6,5,5,1,7,3,4] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_8xi32_perm_mask3(<8 x i32> %vec) { -; CHECK-LABEL: test_8xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50] +; GENERIC-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [3,0,3,1,0,4,5,0] sched: [7:0.50] +; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm3 = [3,0,3,1,0,4,5,0] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 @@ -660,36 +1046,58 @@ define <8 x i32> @test_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, } define <8 x i32> @test_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,0,3,1,0,4,5,0] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_8xi32_perm_mem_mask0(<8 x i32>* %vp) { -; CHECK-LABEL: test_8xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50] -; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [3,7,4,3,5,2,0,5] sched: [7:0.50] +; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [3,7,4,3,5,2,0,5] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -698,13 +1106,21 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %ve } define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [3,7,4,3,5,2,0,5] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -713,13 +1129,21 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> % } define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [4,6,1,7,6,7,6,5] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -728,13 +1152,21 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %ve } define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [4,6,1,7,6,7,6,5] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -743,13 +1175,21 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> % } define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,4,6,1,6,3,6,3] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -758,13 +1198,21 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %ve } define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,4,6,1,6,3,6,3] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -773,23 +1221,37 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> % } define <8 x i32> @test_8xi32_perm_mem_mask3(<8 x i32>* %vp) { -; CHECK-LABEL: test_8xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50] -; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [6,0,0,7,3,7,7,5] sched: [7:0.50] +; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm2 = [6,0,0,7,3,7,7,5] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -798,13 +1260,21 @@ define <8 x i32> @test_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %ve } define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa {{.*#+}} ymm1 = [6,0,0,7,3,7,7,5] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -813,23 +1283,38 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> % } define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] +; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50] +; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 @@ -837,27 +1322,44 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve } define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 @@ -865,27 +1367,44 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve } define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 @@ -893,36 +1412,59 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve } define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { -; CHECK-LABEL: test_16xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] +; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50] +; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 @@ -930,36 +1472,58 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve } define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { -; CHECK-LABEL: test_16xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50] +; SKX-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -968,13 +1532,21 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> } define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -983,13 +1555,21 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3 } define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -998,13 +1578,21 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> } define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -1013,13 +1601,21 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3 } define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -1028,13 +1624,21 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> } define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -1043,23 +1647,37 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3 } define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { -; CHECK-LABEL: test_16xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50] +; SKX-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -1068,13 +1686,21 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> } define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa32 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -1083,21 +1709,34 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i3 } define <4 x i64> @test_4xi64_perm_mask0(<4 x i64> %vec) { -; CHECK-LABEL: test_4xi64_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,0,3,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xi64_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi64_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi64_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,0,3,1] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -1105,25 +1744,40 @@ define <4 x i64> @test_masked_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %vec2, } define <4 x i64> @test_masked_z_4xi64_perm_mask0(<4 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xi64_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi64_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi64_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,0,3,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xi64_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi64_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi64_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -1131,25 +1785,40 @@ define <4 x i64> @test_masked_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %vec2, } define <4 x i64> @test_masked_z_4xi64_perm_mask1(<4 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xi64_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi64_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi64_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xi64_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi64_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi64_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,2,2,1] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -1157,33 +1826,53 @@ define <4 x i64> @test_masked_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %vec2, } define <4 x i64> @test_masked_z_4xi64_perm_mask2(<4 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xi64_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi64_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi64_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,2,2,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_4xi64_perm_mask3(<4 x i64> %vec) { -; CHECK-LABEL: test_4xi64_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xi64_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi64_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi64_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec2 @@ -1191,33 +1880,52 @@ define <4 x i64> @test_masked_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %vec2, } define <4 x i64> @test_masked_z_4xi64_perm_mask3(<4 x i64> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xi64_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi64_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi64_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_4xi64_perm_mem_mask0(<4 x i64>* %vp) { -; CHECK-LABEL: test_4xi64_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,1,2,0] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xi64_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi64_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,2,0] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1226,12 +1934,19 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %ve } define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,2,0] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1240,12 +1955,19 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask0(<4 x i64>* %vp, <4 x i64> % } define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xi64_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi64_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,1,1,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1254,12 +1976,19 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %ve } define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,1,1,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1268,12 +1997,19 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask1(<4 x i64>* %vp, <4 x i64> % } define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xi64_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi64_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[0,1,2,0] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1282,12 +2018,19 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %ve } define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,0] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1296,21 +2039,33 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask2(<4 x i64>* %vp, <4 x i64> % } define <4 x i64> @test_4xi64_perm_mem_mask3(<4 x i64>* %vp) { -; CHECK-LABEL: test_4xi64_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} ymm0 = mem[2,0,1,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %res = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xi64_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi64_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi64_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} = mem[2,0,1,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1319,12 +2074,19 @@ define <4 x i64> @test_masked_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %ve } define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xi64_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi64_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi64_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = mem[2,0,1,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i64>, <4 x i64>* %vp %shuf = shufflevector <4 x i64> %vec, <4 x i64> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -1333,23 +2095,38 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> % } define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { -; CHECK-LABEL: test_8xi64_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [4:0.50] +; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [8:0.50] +; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 @@ -1357,26 +2134,42 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, } define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_imm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_imm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 @@ -1384,26 +2177,42 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %ve } define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[1,0,1,1,5,4,5,5] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 @@ -1411,34 +2220,55 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, } define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_8xi64_perm_imm_mask3(<8 x i64> %vec) { -; CHECK-LABEL: test_8xi64_perm_imm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_perm_imm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_perm_imm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_imm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_imm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 @@ -1446,26 +2276,42 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %ve } define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,1,7,5,7,5] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_mask4: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 @@ -1473,26 +2319,42 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, } define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_mask4: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_imm_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_imm_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 @@ -1500,35 +2362,57 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %ve } define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,0,0,4,4,4,4] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { -; CHECK-LABEL: test_8xi64_perm_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_perm_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [4:0.50] +; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_perm_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [8:0.50] +; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 @@ -1536,26 +2420,42 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, } define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_imm_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_imm_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_imm_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec2 @@ -1563,35 +2463,56 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %ve } define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_imm_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,3,3,7,7,7,7] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) { -; CHECK-LABEL: test_8xi64_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50] -; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [4:0.50] +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [8:0.50] +; SKX-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1600,13 +2521,21 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve } define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1615,12 +2544,19 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> % } define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1629,12 +2565,19 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> } define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,1,1,0,5,5,5,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1643,13 +2586,21 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i6 } define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1658,13 +2609,21 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve } define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1673,21 +2632,33 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> % } define <8 x i64> @test_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp) { -; CHECK-LABEL: test_8xi64_perm_imm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_perm_imm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_perm_imm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} zmm0 = mem[1,3,1,1,5,7,5,5] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1696,12 +2667,19 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> } define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[1,3,1,1,5,7,5,5] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1710,13 +2688,21 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i6 } define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1725,13 +2711,21 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve } define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1740,12 +2734,19 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> % } define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1754,12 +2755,19 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> } define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,1,0,0,7,5,4,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1768,23 +2776,37 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i6 } define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) { -; CHECK-LABEL: test_8xi64_perm_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50] -; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_perm_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [4:0.50] +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_perm_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [8:0.50] +; SKX-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %res = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1793,13 +2815,21 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve } define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1808,12 +2838,19 @@ define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> % } define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xi64_perm_imm_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi64_perm_imm_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi64_perm_imm_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1822,12 +2859,19 @@ define <8 x i64> @test_masked_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> } define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi64_perm_imm_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermq {{.*#+}} zmm0 {%k1} {z} = mem[3,0,0,1,7,4,4,5] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i64>, <8 x i64>* %vp %shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -1836,23 +2880,38 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask7(<8 x i64>* %vp, <8 x i6 } define <8 x float> @test_8xfloat_perm_mask0(<8 x float> %vec) { -; CHECK-LABEL: test_8xfloat_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50] +; GENERIC-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [3,4,2,4,1,2,3,4] sched: [7:0.50] +; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res } define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xfloat_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xfloat_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xfloat_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [3,4,2,4,1,2,3,4] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 @@ -1860,27 +2919,44 @@ define <8 x float> @test_masked_8xfloat_perm_mask0(<8 x float> %vec, <8 x float> } define <8 x float> @test_masked_z_8xfloat_perm_mask0(<8 x float> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xfloat_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xfloat_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [3,4,2,4,1,2,3,4] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xfloat_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xfloat_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xfloat_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [4,2,1,0,6,0,5,1] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 @@ -1888,27 +2964,44 @@ define <8 x float> @test_masked_8xfloat_perm_mask1(<8 x float> %vec, <8 x float> } define <8 x float> @test_masked_z_8xfloat_perm_mask1(<8 x float> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xfloat_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xfloat_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,2,1,0,6,0,5,1] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xfloat_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xfloat_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xfloat_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,5,5,4,6,0,5] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 @@ -1916,36 +3009,59 @@ define <8 x float> @test_masked_8xfloat_perm_mask2(<8 x float> %vec, <8 x float> } define <8 x float> @test_masked_z_8xfloat_perm_mask2(<8 x float> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xfloat_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xfloat_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,5,5,4,6,0,5] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_perm_mask3(<8 x float> %vec) { -; CHECK-LABEL: test_8xfloat_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50] -; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50] +; GENERIC-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [0,5,2,5,5,5,1,6] sched: [7:0.50] +; SKX-NEXT: vpermps %ymm0, %ymm1, %ymm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res } define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xfloat_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xfloat_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 +; GENERIC-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xfloat_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm3 = [0,5,2,5,5,5,1,6] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %ymm0, %ymm3, %ymm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec2 @@ -1953,36 +3069,58 @@ define <8 x float> @test_masked_8xfloat_perm_mask3(<8 x float> %vec, <8 x float> } define <8 x float> @test_masked_z_8xfloat_perm_mask3(<8 x float> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xfloat_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xfloat_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xfloat_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [0,5,2,5,5,5,1,6] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %ymm0, %ymm2, %ymm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_perm_mem_mask0(<8 x float>* %vp) { -; CHECK-LABEL: test_8xfloat_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50] -; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [5,2,1,6,4,2,4,0] sched: [7:0.50] +; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res } define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xfloat_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,2,1,6,4,2,4,0] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -1991,13 +3129,21 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x fl } define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,2,1,6,4,2,4,0] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -2006,13 +3152,21 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask0(<8 x float>* %vp, <8 x } define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xfloat_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,7,4,0,6,6,6] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -2021,13 +3175,21 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x fl } define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [1,3,7,4,0,6,6,6] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -2036,13 +3198,21 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask1(<8 x float>* %vp, <8 x } define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xfloat_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [4,5,1,5,6,6,2,4] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -2051,13 +3221,21 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x fl } define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [4,5,1,5,6,6,2,4] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -2066,23 +3244,37 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask2(<8 x float>* %vp, <8 x } define <8 x float> @test_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50] -; CHECK-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50] +; GENERIC-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm0 = [5,7,0,6,4,2,3,0] sched: [7:0.50] +; SKX-NEXT: vpermps (%rdi), %ymm0, %ymm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %res = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> ret <8 x float> %res } define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_8xfloat_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xfloat_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xfloat_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm2 = [5,7,0,6,4,2,3,0] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %ymm2, %ymm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -2091,13 +3283,21 @@ define <8 x float> @test_masked_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x fl } define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test_masked_z_8xfloat_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xfloat_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xfloat_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} ymm1 = [5,7,0,6,4,2,3,0] sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %ymm1, %ymm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x float>, <8 x float>* %vp %shuf = shufflevector <8 x float> %vec, <8 x float> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -2106,23 +3306,38 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x } define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { -; CHECK-LABEL: test_16xfloat_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] +; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50] +; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res } define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xfloat_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xfloat_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xfloat_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2 @@ -2130,27 +3345,44 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl } define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xfloat_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xfloat_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xfloat_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xfloat_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2 @@ -2158,27 +3390,44 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl } define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xfloat_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xfloat_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xfloat_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xfloat_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2 @@ -2186,36 +3435,59 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl } define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xfloat_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) { -; CHECK-LABEL: test_16xfloat_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50] -; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] +; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50] +; SKX-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res } define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xfloat_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xfloat_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xfloat_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec2 @@ -2223,36 +3495,58 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl } define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xfloat_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) { -; CHECK-LABEL: test_16xfloat_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50] +; SKX-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res } define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xfloat_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2261,13 +3555,21 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 } define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2276,13 +3578,21 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <1 } define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xfloat_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2291,13 +3601,21 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 } define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2306,13 +3624,21 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <1 } define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xfloat_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2321,13 +3647,21 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 } define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2336,23 +3670,37 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <1 } define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) { -; CHECK-LABEL: test_16xfloat_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50] -; CHECK-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50] +; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50] +; SKX-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %res = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> ret <16 x float> %res } define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_16xfloat_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xfloat_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2361,13 +3709,21 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 } define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test_masked_z_16xfloat_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xfloat_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x float>, <16 x float>* %vp %shuf = shufflevector <16 x float> %vec, <16 x float> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -2376,21 +3732,34 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <1 } define <4 x double> @test_4xdouble_perm_mask0(<4 x double> %vec) { -; CHECK-LABEL: test_4xdouble_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,2] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> ret <4 x double> %res } define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xdouble_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xdouble_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xdouble_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,1,3,2] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 @@ -2398,25 +3767,40 @@ define <4 x double> @test_masked_4xdouble_perm_mask0(<4 x double> %vec, <4 x dou } define <4 x double> @test_masked_z_4xdouble_perm_mask0(<4 x double> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xdouble_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xdouble_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,1,3,2] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xdouble_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xdouble_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xdouble_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,0,0,0] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 @@ -2424,25 +3808,40 @@ define <4 x double> @test_masked_4xdouble_perm_mask1(<4 x double> %vec, <4 x dou } define <4 x double> @test_masked_z_4xdouble_perm_mask1(<4 x double> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xdouble_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xdouble_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,0,0,0] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xdouble_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xdouble_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xdouble_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,1] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 @@ -2450,33 +3849,53 @@ define <4 x double> @test_masked_4xdouble_perm_mask2(<4 x double> %vec, <4 x dou } define <4 x double> @test_masked_z_4xdouble_perm_mask2(<4 x double> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xdouble_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xdouble_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_perm_mask3(<4 x double> %vec) { -; CHECK-LABEL: test_4xdouble_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,3,3,2] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> ret <4 x double> %res } define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xdouble_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xdouble_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xdouble_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,2] sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec2 @@ -2484,33 +3903,52 @@ define <4 x double> @test_masked_4xdouble_perm_mask3(<4 x double> %vec, <4 x dou } define <4 x double> @test_masked_z_4xdouble_perm_mask3(<4 x double> %vec, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xdouble_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xdouble_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xdouble_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,2] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_perm_mem_mask0(<4 x double>* %vp) { -; CHECK-LABEL: test_4xdouble_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,0,2,0] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> ret <4 x double> %res } define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xdouble_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,0,2,0] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2519,12 +3957,19 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x } define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,0,2,0] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2533,12 +3978,19 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask0(<4 x double>* %vp, <4 } define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xdouble_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[0,2,3,2] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2547,12 +3999,19 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x } define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[0,2,3,2] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2561,12 +4020,19 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask1(<4 x double>* %vp, <4 } define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xdouble_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,1,1,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2575,12 +4041,19 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x } define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,1,1,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2589,21 +4062,33 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask2(<4 x double>* %vp, <4 } define <4 x double> @test_4xdouble_perm_mem_mask3(<4 x double>* %vp) { -; CHECK-LABEL: test_4xdouble_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} ymm0 = mem[3,2,3,2] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %res = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> ret <4 x double> %res } define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_4xdouble_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xdouble_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xdouble_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} = mem[3,2,3,2] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2612,12 +4097,19 @@ define <4 x double> @test_masked_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x } define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 x i64> %mask) { -; CHECK-LABEL: test_masked_z_4xdouble_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xdouble_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xdouble_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,2] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x double>, <4 x double>* %vp %shuf = shufflevector <4 x double> %vec, <4 x double> undef, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -2626,23 +4118,38 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 } define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { -; CHECK-LABEL: test_8xdouble_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [4:0.50] +; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [8:0.50] +; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 @@ -2650,26 +4157,42 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou } define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_imm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 @@ -2677,26 +4200,42 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x } define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,0,2,7,4,4,6] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 @@ -2704,34 +4243,55 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou } define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_perm_imm_mask3(<8 x double> %vec) { -; CHECK-LABEL: test_8xdouble_perm_imm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_perm_imm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_perm_imm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} zmm0 = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_imm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 @@ -2739,26 +4299,42 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x } define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_mask4: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 @@ -2766,26 +4342,42 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou } define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_mask4: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_imm_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 @@ -2793,35 +4385,57 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x } define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,2,3,7,7,6,7] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) { -; CHECK-LABEL: test_8xdouble_perm_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50] -; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_perm_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [4:0.50] +; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_perm_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [8:0.50] +; SKX-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 +; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 @@ -2829,26 +4443,42 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou } define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_imm_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_imm_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec2 @@ -2856,35 +4486,56 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x } define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,2,7,5,7,6] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) { -; CHECK-LABEL: test_8xdouble_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50] -; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [4:0.50] +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [8:0.50] +; SKX-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2893,13 +4544,21 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x } define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2908,12 +4567,19 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 } define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2922,12 +4588,19 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, } define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,2,0,3,4,6,4,7] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2936,13 +4609,21 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp } define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2951,13 +4632,21 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x } define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2966,21 +4655,33 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 } define <8 x double> @test_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp) { -; CHECK-LABEL: test_8xdouble_perm_imm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_perm_imm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_perm_imm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermpd {{.*#+}} zmm0 = mem[2,1,1,0,6,5,5,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -2989,12 +4690,19 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, } define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,1,1,0,6,5,5,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3003,13 +4711,21 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp } define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3018,13 +4734,21 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x } define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3033,12 +4757,19 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 } define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3047,12 +4778,19 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, } define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[2,2,2,3,6,6,6,7] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3061,23 +4799,37 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp } define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) { -; CHECK-LABEL: test_8xdouble_perm_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50] -; CHECK-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_perm_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [4:0.50] +; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_perm_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [8:0.50] +; SKX-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %res = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> ret <8 x double> %res } define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3086,13 +4838,21 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x } define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3101,12 +4861,19 @@ define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 } define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_8xdouble_perm_imm_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xdouble_perm_imm_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xdouble_perm_imm_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3115,12 +4882,19 @@ define <8 x double> @test_masked_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, } define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp, <8 x i64> %mask) { -; CHECK-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xdouble_perm_imm_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpermpd {{.*#+}} zmm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -3129,21 +4903,34 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask7(<8 x double>* %vp } define <16 x i8> @test_16xi8_perm_mask0(<16 x i8> %vec) { -; CHECK-LABEL: test_16xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> ret <16 x i8> %res } define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_16xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2 @@ -3151,25 +4938,40 @@ define <16 x i8> @test_masked_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %vec2, } define <16 x i8> @test_masked_z_16xi8_perm_mask0(<16 x i8> %vec, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_z_16xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[8,6,12,4,7,9,14,8,4,12,9,4,14,15,12,14] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer ret <16 x i8> %res } define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_16xi8_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi8_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi8_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2 @@ -3177,25 +4979,40 @@ define <16 x i8> @test_masked_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %vec2, } define <16 x i8> @test_masked_z_16xi8_perm_mask1(<16 x i8> %vec, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_z_16xi8_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi8_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi8_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[4,11,14,10,7,1,6,9,14,15,7,13,4,12,8,0] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer ret <16 x i8> %res } define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_16xi8_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi8_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi8_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2 @@ -3203,33 +5020,53 @@ define <16 x i8> @test_masked_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %vec2, } define <16 x i8> @test_masked_z_16xi8_perm_mask2(<16 x i8> %vec, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_z_16xi8_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi8_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi8_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[11,6,13,10,0,7,13,3,5,13,3,9,3,15,12,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer ret <16 x i8> %res } define <16 x i8> @test_16xi8_perm_mask3(<16 x i8> %vec) { -; CHECK-LABEL: test_16xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> ret <16 x i8> %res } define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_16xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm1 {%k1} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> %vec2 @@ -3237,35 +5074,56 @@ define <16 x i8> @test_masked_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %vec2, } define <16 x i8> @test_masked_z_16xi8_perm_mask3(<16 x i8> %vec, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_z_16xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm0[1,5,8,14,1,8,11,8,13,8,15,9,9,7,9,6] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i8> %shuf, <16 x i8> zeroinitializer ret <16 x i8> %res } define <16 x i8> @test_16xi8_perm_mem_mask0(<16 x i8>* %vp) { -; CHECK-LABEL: test_16xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> ret <16 x i8> %res } define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_16xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer @@ -3274,13 +5132,21 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %ve } define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,10,7,1,12,14,14,13,14,14,8,6,11,4,12,13] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer @@ -3289,13 +5155,21 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask0(<16 x i8>* %vp, <16 x i8> % } define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_16xi8_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi8_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer @@ -3304,13 +5178,21 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %ve } define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[14,9,15,9,7,10,15,14,12,1,9,7,10,13,3,11] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer @@ -3319,13 +5201,21 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask1(<16 x i8>* %vp, <16 x i8> % } define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_16xi8_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi8_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer @@ -3334,13 +5224,21 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %ve } define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[1,3,12,5,13,1,2,11,0,9,14,8,10,0,10,9] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer @@ -3349,23 +5247,37 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask2(<16 x i8>* %vp, <16 x i8> % } define <16 x i8> @test_16xi8_perm_mem_mask3(<16 x i8>* %vp) { -; CHECK-LABEL: test_16xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm0 # sched: [6:0.50] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %res = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> ret <16 x i8> %res } define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %vec2, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_16xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm2 # sched: [6:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm3, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} = xmm2[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer @@ -3374,13 +5286,21 @@ define <16 x i8> @test_masked_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %ve } define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> %mask) { -; CHECK-LABEL: test_masked_z_16xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %xmm1 # sched: [6:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %xmm2, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} xmm0 {%k1} {z} = xmm1[9,6,5,15,0,0,15,2,1,3,12,14,0,6,1,4] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i8>, <16 x i8>* %vp %shuf = shufflevector <16 x i8> %vec, <16 x i8> undef, <16 x i32> %cmp = icmp eq <16 x i8> %mask, zeroinitializer @@ -3389,21 +5309,34 @@ define <16 x i8> @test_masked_z_16xi8_perm_mem_mask3(<16 x i8>* %vp, <16 x i8> % } define <32 x i8> @test_32xi8_perm_mask0(<32 x i8> %vec) { -; CHECK-LABEL: test_32xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> ret <32 x i8> %res } define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_32xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2 @@ -3411,25 +5344,40 @@ define <32 x i8> @test_masked_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %vec2, } define <32 x i8> @test_masked_z_32xi8_perm_mask0(<32 x i8> %vec, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_z_32xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[8,0,1,15,3,5,11,13,14,2,10,15,0,10,13,5,20,25,23,18,23,22,25,24,20,21,29,20,24,16,27,21] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer ret <32 x i8> %res } define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_32xi8_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi8_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi8_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2 @@ -3437,25 +5385,40 @@ define <32 x i8> @test_masked_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %vec2, } define <32 x i8> @test_masked_z_32xi8_perm_mask1(<32 x i8> %vec, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_z_32xi8_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi8_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi8_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[0,4,3,15,5,4,5,15,10,9,11,6,6,10,0,3,21,19,26,22,30,25,22,22,27,22,26,16,23,20,18,24] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer ret <32 x i8> %res } define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_32xi8_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi8_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi8_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2 @@ -3463,33 +5426,53 @@ define <32 x i8> @test_masked_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %vec2, } define <32 x i8> @test_masked_z_32xi8_perm_mask2(<32 x i8> %vec, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_z_32xi8_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi8_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi8_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[7,8,12,14,7,4,7,12,14,12,3,15,10,1,11,15,22,26,21,19,27,16,29,24,17,17,26,29,20,31,17,29] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer ret <32 x i8> %res } define <32 x i8> @test_32xi8_perm_mask3(<32 x i8> %vec) { -; CHECK-LABEL: test_32xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> ret <32 x i8> %res } define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_32xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> %vec2 @@ -3497,35 +5480,56 @@ define <32 x i8> @test_masked_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %vec2, } define <32 x i8> @test_masked_z_32xi8_perm_mask3(<32 x i8> %vec, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_z_32xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm0[6,1,4,7,12,13,2,8,10,5,13,4,0,0,10,8,31,31,30,16,27,27,26,27,30,26,21,24,19,25,16,18] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i8> %shuf, <32 x i8> zeroinitializer ret <32 x i8> %res } define <32 x i8> @test_32xi8_perm_mem_mask0(<32 x i8>* %vp) { -; CHECK-LABEL: test_32xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> ret <32 x i8> %res } define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_32xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer @@ -3534,13 +5538,21 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %ve } define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[9,0,2,15,4,6,8,4,7,3,0,2,8,1,6,5,22,17,30,23,29,31,21,23,27,22,20,27,30,30,26,22] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer @@ -3549,13 +5561,21 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask0(<32 x i8>* %vp, <32 x i8> % } define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_32xi8_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi8_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer @@ -3564,13 +5584,21 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %ve } define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[15,10,1,1,11,0,0,6,8,7,7,9,10,6,5,15,20,28,22,21,17,29,27,30,23,26,17,22,19,16,31,19] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer @@ -3579,13 +5607,21 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask1(<32 x i8>* %vp, <32 x i8> % } define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_32xi8_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi8_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer @@ -3594,13 +5630,21 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %ve } define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[2,3,6,8,2,15,15,2,6,10,14,7,14,5,7,7,26,19,25,19,21,31,30,29,16,18,20,28,29,25,27,28] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer @@ -3609,23 +5653,37 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask2(<32 x i8>* %vp, <32 x i8> % } define <32 x i8> @test_32xi8_perm_mem_mask3(<32 x i8>* %vp) { -; CHECK-LABEL: test_32xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm0 # sched: [7:0.50] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %res = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> ret <32 x i8> %res } define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %vec2, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_32xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm2 # sched: [7:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm3, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} = ymm2[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer @@ -3634,13 +5692,21 @@ define <32 x i8> @test_masked_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %ve } define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> %mask) { -; CHECK-LABEL: test_masked_z_32xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa (%rdi), %ymm1 # sched: [7:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %ymm2, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} ymm0 {%k1} {z} = ymm1[1,1,13,0,3,0,0,13,5,2,2,10,15,8,14,8,25,26,28,28,31,27,30,19,24,25,29,23,28,22,25,29] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i8>, <32 x i8>* %vp %shuf = shufflevector <32 x i8> %vec, <32 x i8> undef, <32 x i32> %cmp = icmp eq <32 x i8> %mask, zeroinitializer @@ -3649,21 +5715,34 @@ define <32 x i8> @test_masked_z_32xi8_perm_mem_mask3(<32 x i8>* %vp, <32 x i8> % } define <64 x i8> @test_64xi8_perm_mask0(<64 x i8> %vec) { -; CHECK-LABEL: test_64xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_64xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_64xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> ret <64 x i8> %res } define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_64xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_64xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_64xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 @@ -3671,25 +5750,40 @@ define <64 x i8> @test_masked_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %vec2, } define <64 x i8> @test_masked_z_64xi8_perm_mask0(<64 x i8> %vec, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_z_64xi8_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_64xi8_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_64xi8_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[8,4,1,13,15,4,6,12,0,10,2,4,13,0,0,6,23,29,27,26,18,31,22,25,22,16,23,18,16,25,26,17,40,37,38,44,39,46,41,39,42,37,33,42,41,44,34,46,60,62,61,58,60,56,60,51,60,55,60,55,60,49,48,62] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ret <64 x i8> %res } define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_64xi8_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_64xi8_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_64xi8_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 @@ -3697,25 +5791,40 @@ define <64 x i8> @test_masked_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %vec2, } define <64 x i8> @test_masked_z_64xi8_perm_mask1(<64 x i8> %vec, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_z_64xi8_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_64xi8_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_64xi8_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[7,14,15,10,9,3,1,13,14,12,11,6,4,1,6,9,30,30,22,17,28,27,16,23,26,16,30,31,27,17,17,21,32,37,32,47,45,33,46,35,35,42,47,33,32,37,32,41,61,50,49,53,63,50,63,53,55,52,62,63,58,50,63,49] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ret <64 x i8> %res } define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_64xi8_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_64xi8_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_64xi8_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 @@ -3723,33 +5832,53 @@ define <64 x i8> @test_masked_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %vec2, } define <64 x i8> @test_masked_z_64xi8_perm_mask2(<64 x i8> %vec, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_z_64xi8_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_64xi8_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_64xi8_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[9,2,14,15,12,5,3,12,4,6,0,2,0,1,1,6,24,27,18,22,26,17,23,21,31,16,22,22,27,21,19,20,39,47,44,36,40,43,44,39,38,44,38,35,39,46,34,39,58,55,51,48,59,57,48,52,60,58,56,50,59,55,58,60] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ret <64 x i8> %res } define <64 x i8> @test_64xi8_perm_mask3(<64 x i8> %vec) { -; CHECK-LABEL: test_64xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_64xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_64xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> ret <64 x i8> %res } define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_64xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_64xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_64xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm1 {%k1} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> %vec2 @@ -3757,35 +5886,56 @@ define <64 x i8> @test_masked_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %vec2, } define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_z_64xi8_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_64xi8_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_64xi8_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm0[3,12,4,15,1,14,0,4,8,9,6,1,4,4,12,14,25,16,28,20,21,24,19,30,18,22,20,24,25,26,24,22,42,38,44,44,36,37,42,34,43,38,41,34,42,37,39,38,55,59,53,58,48,52,59,48,57,48,55,62,48,56,49,61] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer %res = select <64 x i1> %cmp, <64 x i8> %shuf, <64 x i8> zeroinitializer ret <64 x i8> %res } define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) { -; CHECK-LABEL: test_64xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_64xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_64xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> ret <64 x i8> %res } define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_64xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_64xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -3794,13 +5944,21 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve } define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -3809,13 +5967,21 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> % } define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_64xi8_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_64xi8_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -3824,13 +5990,21 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve } define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -3839,13 +6013,21 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> % } define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_64xi8_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_64xi8_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -3854,13 +6036,21 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve } define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -3869,23 +6059,37 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> % } define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) { -; CHECK-LABEL: test_64xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_64xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [4:0.50] +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_64xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [8:0.50] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %res = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> ret <64 x i8> %res } define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_64xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_64xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [8:0.50] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -3894,13 +6098,21 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve } define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) { -; CHECK-LABEL: test_masked_z_64xi8_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [4:0.50] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_64xi8_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [8:0.50] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <64 x i8>, <64 x i8>* %vp %shuf = shufflevector <64 x i8> %vec, <64 x i8> undef, <64 x i32> %cmp = icmp eq <64 x i8> %mask, zeroinitializer @@ -3909,21 +6121,34 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> % } define <8 x i16> @test_8xi16_perm_high_mask0(<8 x i16> %vec) { -; CHECK-LABEL: test_8xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 @@ -3931,25 +6156,40 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v } define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 @@ -3957,25 +6197,40 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve } define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 @@ -3983,33 +6238,53 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v } define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_8xi16_perm_low_mask3(<8 x i16> %vec) { -; CHECK-LABEL: test_8xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 @@ -4017,25 +6292,40 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve } define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_high_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_high_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 @@ -4043,25 +6333,40 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v } define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_low_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_low_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 @@ -4069,33 +6374,53 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve } define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_8xi16_perm_high_mask6(<8 x i16> %vec) { -; CHECK-LABEL: test_8xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 @@ -4103,25 +6428,40 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v } define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_low_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_low_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> %vec2 @@ -4129,33 +6469,52 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve } define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_low_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i16> %shuf, <8 x i16> zeroinitializer ret <8 x i16> %res } define <8 x i16> @test_8xi16_perm_high_mem_mask0(<8 x i16>* %vp) { -; CHECK-LABEL: test_8xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,7,4,6] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4164,12 +6523,19 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16 } define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4178,12 +6544,19 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i } define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4192,12 +6565,19 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> } define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4206,12 +6586,19 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1 } define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4220,12 +6607,19 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16 } define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4234,21 +6628,33 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i } define <8 x i16> @test_8xi16_perm_low_mem_mask3(<8 x i16>* %vp) { -; CHECK-LABEL: test_8xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[3,1,2,0,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4257,12 +6663,19 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> } define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4271,12 +6684,19 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1 } define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4285,12 +6705,19 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16 } define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4299,12 +6726,19 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i } define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4313,12 +6747,19 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> } define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4327,21 +6768,33 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i1 } define <8 x i16> @test_8xi16_perm_high_mem_mask6(<8 x i16>* %vp) { -; CHECK-LABEL: test_8xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,7,4,4,4] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %res = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> ret <8 x i16> %res } define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4350,12 +6803,19 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16 } define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4364,12 +6824,19 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i } define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %vec2, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_8xi16_perm_low_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4378,12 +6845,19 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> } define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> %mask) { -; CHECK-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i16>, <8 x i16>* %vp %shuf = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> %cmp = icmp eq <8 x i16> %mask, zeroinitializer @@ -4392,21 +6866,34 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i1 } define <16 x i16> @test_16xi16_perm_high_mask0(<16 x i16> %vec) { -; CHECK-LABEL: test_16xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -4414,25 +6901,40 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16 } define <16 x i16> @test_masked_z_16xi16_perm_high_mask0(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,4,4,6,4,8,9,10,11,12,12,14,12] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -4440,25 +6942,40 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> } define <16 x i16> @test_masked_z_16xi16_perm_low_mask1(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,2,3,2,4,5,6,7,8,10,11,10,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -4466,33 +6983,53 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16 } define <16 x i16> @test_masked_z_16xi16_perm_high_mask2(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,7,5,5,5,8,9,10,11,15,13,13,13] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_16xi16_perm_low_mask3(<16 x i16> %vec) { -; CHECK-LABEL: test_16xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -4500,25 +7037,40 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> } define <16 x i16> @test_masked_z_16xi16_perm_low_mask3(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,3,2,4,5,6,7,11,10,11,10,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_high_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_high_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_high_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -4526,25 +7078,40 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16 } define <16 x i16> @test_masked_z_16xi16_perm_high_mask4(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_high_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,4,7,8,9,10,11,14,15,12,15] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_low_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_low_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_low_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -4552,33 +7119,53 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> } define <16 x i16> @test_masked_z_16xi16_perm_low_mask5(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_low_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,3,3,0,4,5,6,7,11,11,11,8,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_16xi16_perm_high_mask6(<16 x i16> %vec) { -; CHECK-LABEL: test_16xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm1 {%k1} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -4586,25 +7173,40 @@ define <16 x i16> @test_masked_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16 } define <16 x i16> @test_masked_z_16xi16_perm_high_mask6(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = ymm0[0,1,2,3,6,7,6,5,8,9,10,11,14,15,14,13] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_low_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_low_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_low_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm1 {%k1} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> %vec2 @@ -4612,33 +7214,52 @@ define <16 x i16> @test_masked_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> } define <16 x i16> @test_masked_z_16xi16_perm_low_mask7(<16 x i16> %vec, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_low_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_low_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = ymm0[3,2,1,2,4,5,6,7,11,10,9,10,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i16> %shuf, <16 x i16> zeroinitializer ret <16 x i16> %res } define <16 x i16> @test_16xi16_perm_high_mem_mask0(<16 x i16>* %vp) { -; CHECK-LABEL: test_16xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4647,12 +7268,19 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x } define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,4,7,8,9,10,11,13,14,12,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4661,12 +7289,19 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask0(<16 x i16>* %vp, <16 } define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4675,12 +7310,19 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x } define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,0,4,5,6,7,9,11,11,8,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4689,12 +7331,19 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask1(<16 x i16>* %vp, <16 } define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4703,12 +7352,19 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x } define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,5,6,5,6,8,9,10,11,13,14,13,14] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4717,21 +7373,33 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask2(<16 x i16>* %vp, <16 } define <16 x i16> @test_16xi16_perm_low_mem_mask3(<16 x i16>* %vp) { -; CHECK-LABEL: test_16xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4740,12 +7408,19 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x } define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,0,4,5,6,7,11,10,11,8,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4754,12 +7429,19 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask3(<16 x i16>* %vp, <16 } define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4768,12 +7450,19 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x } define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,7,7,6,7,8,9,10,11,15,15,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4782,12 +7471,19 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask4(<16 x i16>* %vp, <16 } define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4796,12 +7492,19 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x } define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7,9,11,11,10,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4810,21 +7513,33 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask5(<16 x i16>* %vp, <16 } define <16 x i16> @test_16xi16_perm_high_mem_mask6(<16 x i16>* %vp) { -; CHECK-LABEL: test_16xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [5:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %res = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> ret <16 x i16> %res } define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4833,12 +7548,19 @@ define <16 x i16> @test_masked_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x } define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,4,4,4,5,8,9,10,11,12,12,12,13] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4847,12 +7569,19 @@ define <16 x i16> @test_masked_z_16xi16_perm_high_mem_mask6(<16 x i16>* %vp, <16 } define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %vec2, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_16xi16_perm_low_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_16xi16_perm_low_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_16xi16_perm_low_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4861,12 +7590,19 @@ define <16 x i16> @test_masked_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x } define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 x i16> %mask) { -; CHECK-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_16xi16_perm_low_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} ymm0 {%k1} {z} = mem[3,1,3,2,4,5,6,7,11,9,11,10,12,13,14,15] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i16>, <16 x i16>* %vp %shuf = shufflevector <16 x i16> %vec, <16 x i16> undef, <16 x i32> %cmp = icmp eq <16 x i16> %mask, zeroinitializer @@ -4875,21 +7611,34 @@ define <16 x i16> @test_masked_z_16xi16_perm_low_mem_mask7(<16 x i16>* %vp, <16 } define <32 x i16> @test_32xi16_perm_high_mask0(<32 x i16> %vec) { -; CHECK-LABEL: test_32xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -4897,25 +7646,40 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16 } define <32 x i16> @test_masked_z_32xi16_perm_high_mask0(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12,16,17,18,19,20,21,22,20,24,25,26,27,28,29,30,28] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -4923,25 +7687,40 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> } define <32 x i16> @test_masked_z_32xi16_perm_low_mask1(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,0,0,4,5,6,7,10,9,8,8,12,13,14,15,18,17,16,16,20,21,22,23,26,25,24,24,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -4949,33 +7728,53 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16 } define <32 x i16> @test_masked_z_32xi16_perm_high_mask2(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,6,4,7,8,9,10,11,12,14,12,15,16,17,18,19,20,22,20,23,24,25,26,27,28,30,28,31] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_32xi16_perm_low_mask3(<32 x i16> %vec) { -; CHECK-LABEL: test_32xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -4983,25 +7782,40 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> } define <32 x i16> @test_masked_z_32xi16_perm_low_mask3(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,3,1,3,4,5,6,7,11,11,9,11,12,13,14,15,19,19,17,19,20,21,22,23,27,27,25,27,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_high_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_high_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_high_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -5009,25 +7823,40 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16 } define <32 x i16> @test_masked_z_32xi16_perm_high_mask4(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_high_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,7,7,5,6,8,9,10,11,15,15,13,14,16,17,18,19,23,23,21,22,24,25,26,27,31,31,29,30] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_low_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_low_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_low_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -5035,33 +7864,53 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> } define <32 x i16> @test_masked_z_32xi16_perm_low_mask5(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_low_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[2,1,1,0,4,5,6,7,10,9,9,8,12,13,14,15,18,17,17,16,20,21,22,23,26,25,25,24,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_32xi16_perm_high_mask6(<32 x i16> %vec) { -; CHECK-LABEL: test_32xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -5069,25 +7918,40 @@ define <32 x i16> @test_masked_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16 } define <32 x i16> @test_masked_z_32xi16_perm_high_mask6(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_high_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_high_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,4,4,5,6,8,9,10,11,12,12,13,14,16,17,18,19,20,20,21,22,24,25,26,27,28,28,29,30] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_low_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_low_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_low_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm1 {%k1} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> %vec2 @@ -5095,33 +7959,52 @@ define <32 x i16> @test_masked_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> } define <32 x i16> @test_masked_z_32xi16_perm_low_mask7(<32 x i16> %vec, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_low_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_low_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = zmm0[3,0,3,0,4,5,6,7,11,8,11,8,12,13,14,15,19,16,19,16,20,21,22,23,27,24,27,24,28,29,30,31] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer %res = select <32 x i1> %cmp, <32 x i16> %shuf, <32 x i16> zeroinitializer ret <32 x i16> %res } define <32 x i16> @test_32xi16_perm_high_mem_mask0(<32 x i16>* %vp) { -; CHECK-LABEL: test_32xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5130,12 +8013,19 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x } define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,5,6,8,9,10,11,15,12,13,14,16,17,18,19,23,20,21,22,24,25,26,27,31,28,29,30] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5144,12 +8034,19 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask0(<32 x i16>* %vp, <32 } define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5158,12 +8055,19 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x } define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[1,1,3,3,4,5,6,7,9,9,11,11,12,13,14,15,17,17,19,19,20,21,22,23,25,25,27,27,28,29,30,31] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5172,12 +8076,19 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask1(<32 x i16>* %vp, <32 } define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5186,12 +8097,19 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x } define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,4,7,6,4,8,9,10,11,12,15,14,12,16,17,18,19,20,23,22,20,24,25,26,27,28,31,30,28] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5200,21 +8118,33 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask2(<32 x i16>* %vp, <32 } define <32 x i16> @test_32xi16_perm_low_mem_mask3(<32 x i16>* %vp) { -; CHECK-LABEL: test_32xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5223,12 +8153,19 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x } define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[2,2,0,3,4,5,6,7,10,10,8,11,12,13,14,15,18,18,16,19,20,21,22,23,26,26,24,27,28,29,30,31] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5237,12 +8174,19 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask3(<32 x i16>* %vp, <32 } define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5251,12 +8195,19 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x } define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask4: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,7,4,6,5,8,9,10,11,15,12,14,13,16,17,18,19,23,20,22,21,24,25,26,27,31,28,30,29] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5265,13 +8216,21 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask4(<32 x i16>* %vp, <32 } define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 +; GENERIC-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpshufd {{.*#+}} zmm2 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm3, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu16 %zmm2, %zmm0 {%k1} +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5280,13 +8239,21 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x } define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 +; GENERIC-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask5: +; SKX: # BB#0: +; SKX-NEXT: vpshufd {{.*#+}} zmm1 = mem[0,0,2,3,4,4,6,7,8,8,10,11,12,12,14,15] sched: [8:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5295,21 +8262,33 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask5(<32 x i16>* %vp, <32 } define <32 x i16> @test_32xi16_perm_high_mem_mask6(<32 x i16>* %vp) { -; CHECK-LABEL: test_32xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_32xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_32xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %res = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> ret <32 x i16> %res } define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5318,12 +8297,19 @@ define <32 x i16> @test_masked_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x } define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_high_mem_mask6: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,6,5,6,6,8,9,10,11,14,13,14,14,16,17,18,19,22,21,22,22,24,25,26,27,30,29,30,30] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5332,12 +8318,19 @@ define <32 x i16> @test_masked_z_32xi16_perm_high_mem_mask6(<32 x i16>* %vp, <32 } define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_32xi16_perm_low_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_32xi16_perm_low_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_32xi16_perm_low_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5346,12 +8339,19 @@ define <32 x i16> @test_masked_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x } define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 x i16> %mask) { -; CHECK-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_32xi16_perm_low_mem_mask7: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 {%k1} {z} = mem[3,1,3,0,4,5,6,7,11,9,11,8,12,13,14,15,19,17,19,16,20,21,22,23,27,25,27,24,28,29,30,31] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <32 x i16>, <32 x i16>* %vp %shuf = shufflevector <32 x i16> %vec, <32 x i16> undef, <32 x i32> %cmp = icmp eq <32 x i16> %mask, zeroinitializer @@ -5360,21 +8360,34 @@ define <32 x i16> @test_masked_z_32xi16_perm_low_mem_mask7(<32 x i16>* %vp, <32 } define <4 x i32> @test_4xi32_perm_mask0(<4 x i32> %vec) { -; CHECK-LABEL: test_4xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,3,3,0] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res } define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_4xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 @@ -5382,25 +8395,40 @@ define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, } define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_z_4xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_4xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 @@ -5408,25 +8436,40 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, } define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_z_4xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_4xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 @@ -5434,33 +8477,53 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, } define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_z_4xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_4xi32_perm_mask3(<4 x i32> %vec) { -; CHECK-LABEL: test_4xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,1,0,3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res } define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_4xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] +; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00] +; SKX-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> %vec2 @@ -5468,33 +8531,52 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, } define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_z_4xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i32> %shuf, <4 x i32> zeroinitializer ret <4 x i32> %res } define <4 x i32> @test_4xi32_perm_mem_mask0(<4 x i32>* %vp) { -; CHECK-LABEL: test_4xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,1,3,3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res } define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_4xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -5503,12 +8585,19 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve } define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -5517,12 +8606,19 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> % } define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_4xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -5531,12 +8627,19 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve } define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -5545,12 +8648,19 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> % } define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_4xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -5559,12 +8669,19 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve } define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -5573,21 +8690,33 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> % } define <4 x i32> @test_4xi32_perm_mem_mask3(<4 x i32>* %vp) { -; CHECK-LABEL: test_4xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} xmm0 = mem[1,0,1,0] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %res = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> ret <4 x i32> %res } define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_4xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_4xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -5596,12 +8725,19 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve } define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %mask) { -; CHECK-LABEL: test_masked_z_4xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <4 x i32>, <4 x i32>* %vp %shuf = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -5610,21 +8746,34 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> % } define <8 x i32> @test2_8xi32_perm_mask0(<8 x i32> %vec) { -; CHECK-LABEL: test2_8xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_8xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_8xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_8xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 @@ -5632,25 +8781,40 @@ define <8 x i32> @test2_masked_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %vec2, } define <8 x i32> @test2_masked_z_8xi32_perm_mask0(<8 x i32> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_8xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_8xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,1,0,6,7,5,4] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_8xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_8xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_8xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 @@ -5658,25 +8822,40 @@ define <8 x i32> @test2_masked_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %vec2, } define <8 x i32> @test2_masked_z_8xi32_perm_mask1(<8 x i32> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_8xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_8xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[0,3,3,3,4,7,7,7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_8xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_8xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_8xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 @@ -5684,33 +8863,53 @@ define <8 x i32> @test2_masked_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %vec2, } define <8 x i32> @test2_masked_z_8xi32_perm_mask2(<8 x i32> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_8xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_8xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,0,3,5,6,4,7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test2_8xi32_perm_mask3(<8 x i32> %vec) { -; CHECK-LABEL: test2_8xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_8xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] -; CHECK-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_8xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] +; GENERIC-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.50] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_8xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm1 {%k1} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] +; SKX-NEXT: vmovdqa %ymm1, %ymm0 # sched: [1:0.25] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec2 @@ -5718,33 +8917,52 @@ define <8 x i32> @test2_masked_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %vec2, } define <8 x i32> @test2_masked_z_8xi32_perm_mask3(<8 x i32> %vec, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_8xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_8xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_8xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,3,1,0,5,7,5,4] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test2_8xi32_perm_mem_mask0(<8 x i32>* %vp) { -; CHECK-LABEL: test2_8xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} ymm0 = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_8xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -5753,12 +8971,19 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %v } define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[1,0,2,0,5,4,6,4] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -5767,12 +8992,19 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask0(<8 x i32>* %vp, <8 x i32> } define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_8xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -5781,12 +9013,19 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %v } define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[0,3,2,0,4,7,6,4] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -5795,12 +9034,19 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask1(<8 x i32>* %vp, <8 x i32> } define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_8xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -5809,12 +9055,19 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %v } define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,3,1,7,6,7,5] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -5823,21 +9076,33 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask2(<8 x i32>* %vp, <8 x i32> } define <8 x i32> @test2_8xi32_perm_mem_mask3(<8 x i32>* %vp) { -; CHECK-LABEL: test2_8xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} ymm0 = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %res = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_8xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_8xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_8xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -5846,12 +9111,19 @@ define <8 x i32> @test2_masked_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %v } define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_8xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_8xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_8xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} ymm0 {%k1} {z} = mem[3,2,0,0,7,6,4,4] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <8 x i32>, <8 x i32>* %vp %shuf = shufflevector <8 x i32> %vec, <8 x i32> undef, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -5860,21 +9132,34 @@ define <8 x i32> @test2_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> } define <16 x i32> @test2_16xi32_perm_mask0(<16 x i32> %vec) { -; CHECK-LABEL: test2_16xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_16xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_16xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_16xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_16xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_16xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 @@ -5882,25 +9167,40 @@ define <16 x i32> @test2_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %v } define <16 x i32> @test2_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_16xi32_perm_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_16xi32_perm_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,1,3,0,7,5,7,4,11,9,11,8,15,13,15,12] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_16xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_16xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_16xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 @@ -5908,25 +9208,40 @@ define <16 x i32> @test2_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %v } define <16 x i32> @test2_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_16xi32_perm_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_16xi32_perm_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,0,3,0,6,4,7,4,10,8,11,8,14,12,15,12] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_16xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_16xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_16xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 @@ -5934,33 +9249,53 @@ define <16 x i32> @test2_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %v } define <16 x i32> @test2_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_16xi32_perm_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_16xi32_perm_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[1,3,3,0,5,7,7,4,9,11,11,8,13,15,15,12] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test2_16xi32_perm_mask3(<16 x i32> %vec) { -; CHECK-LABEL: test2_16xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_16xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_16xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_16xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_16xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_16xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm1 {%k1} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec2 @@ -5968,33 +9303,52 @@ define <16 x i32> @test2_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %v } define <16 x i32> @test2_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_16xi32_perm_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_16xi32_perm_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_16xi32_perm_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,0,3,7,6,4,7,11,10,8,11,15,14,12,15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test2_16xi32_perm_mem_mask0(<16 x i32>* %vp) { -; CHECK-LABEL: test2_16xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_16xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_16xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} zmm0 = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_16xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6003,12 +9357,19 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32 } define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,1,3,5,4,5,7,9,8,9,11,13,12,13,15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6017,12 +9378,19 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i } define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_16xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6031,12 +9399,19 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32 } define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[1,0,0,2,5,4,4,6,9,8,8,10,13,12,12,14] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6045,12 +9420,19 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i } define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_16xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6059,12 +9441,19 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32 } define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[2,0,1,2,6,4,5,6,10,8,9,10,14,12,13,14] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6073,21 +9462,33 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i } define <16 x i32> @test2_16xi32_perm_mem_mask3(<16 x i32>* %vp) { -; CHECK-LABEL: test2_16xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_16xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_16xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpermilps {{.*#+}} zmm0 = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_16xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_16xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_16xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6096,12 +9497,19 @@ define <16 x i32> @test2_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32 } define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) { -; CHECK-LABEL: test2_masked_z_16xi32_perm_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_masked_z_16xi32_perm_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 +; GENERIC-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_masked_z_16xi32_perm_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm1, %zmm0, %k1 # sched: [3:1.00] +; SKX-NEXT: vpshufd {{.*#+}} zmm0 {%k1} {z} = mem[3,1,1,1,7,5,5,5,11,9,9,9,15,13,13,13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec = load <16 x i32>, <16 x i32>* %vp %shuf = shufflevector <16 x i32> %vec, <16 x i32> undef, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6110,21 +9518,34 @@ define <16 x i32> @test2_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i } define <8 x float> @test2_8xfloat_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2) { -; CHECK-LABEL: test2_8xfloat_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test2_8xfloat_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -6132,26 +9553,42 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask0(<8 x float> %vec1, <8 x flo } define <8 x float> @test2_8xfloat_zero_masked_shuff_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test2_8xfloat_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -6159,26 +9596,42 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask1(<8 x float> %vec1, <8 x flo } define <8 x float> @test2_8xfloat_zero_masked_shuff_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test2_8xfloat_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -6186,34 +9639,55 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask2(<8 x float> %vec1, <8 x flo } define <8 x float> @test2_8xfloat_zero_masked_shuff_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test2_8xfloat_zero_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_zero_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_zero_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test2_8xfloat_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2) { -; CHECK-LABEL: test2_8xfloat_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test2_8xfloat_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test2_8xfloat_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test2_8xfloat_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -6221,35 +9695,56 @@ define <8 x float> @test2_8xfloat_masked_shuff_mask3(<8 x float> %vec1, <8 x flo } define <8 x float> @test_8xfloat_zero_masked_shuff_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { -; CHECK-LABEL: test_8xfloat_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -6258,13 +9753,21 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x } define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -6273,13 +9776,21 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask0(<8 x float> %vec1, } define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -6288,13 +9799,21 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x } define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -6303,13 +9822,21 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask1(<8 x float> %vec1, } define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -6318,13 +9845,21 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x } define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -6333,22 +9868,35 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask2(<8 x float> %vec1, } define <8 x float> @test_8xfloat_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { -; CHECK-LABEL: test_8xfloat_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmps %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -6357,13 +9905,21 @@ define <8 x float> @test_8xfloat_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x } define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -6372,21 +9928,34 @@ define <8 x float> @test_8xfloat_zero_masked_shuff_mem_mask3(<8 x float> %vec1, } define <16 x float> @test_16xfloat_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],zmm1[2,3,6,7] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -6394,25 +9963,40 @@ define <16 x float> @test_16xfloat_masked_shuff_mask0(<16 x float> %vec1, <16 x } define <16 x float> @test_16xfloat_zero_masked_shuff_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,0,1,2,3],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -6420,25 +10004,40 @@ define <16 x float> @test_16xfloat_masked_shuff_mask1(<16 x float> %vec1, <16 x } define <16 x float> @test_16xfloat_zero_masked_shuff_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,8,9,10,11],zmm1[0,1,2,3,12,13,14,15] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -6446,33 +10045,53 @@ define <16 x float> @test_16xfloat_masked_shuff_mask2(<16 x float> %vec1, <16 x } define <16 x float> @test_16xfloat_zero_masked_shuff_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,4,5,6,7],zmm1[0,1,2,3,4,5,6,7] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2) { -; CHECK-LABEL: test_16xfloat_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7],zmm1[0,1,4,5] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -6480,34 +10099,54 @@ define <16 x float> @test_16xfloat_masked_shuff_mask3(<16 x float> %vec1, <16 x } define <16 x float> @test_16xfloat_zero_masked_shuff_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,12,13,14,15],zmm1[0,1,2,3,8,9,10,11] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { -; CHECK-LABEL: test_16xfloat_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,4,5],mem[4,5,2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6516,12 +10155,19 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask0(<16 x float> %vec1, <1 } define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[12,13,14,15,8,9,10,11],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6530,13 +10176,21 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask0(<16 x float> %vec } define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6545,12 +10199,19 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask1(<16 x float> %vec1, <1 } define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,4,5,6,7] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6559,13 +10220,21 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask1(<16 x float> %vec } define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6574,12 +10243,19 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask2(<16 x float> %vec1, <1 } define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3,0,1,2,3],mem[8,9,10,11,8,9,10,11] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6588,22 +10264,35 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask2(<16 x float> %vec } define <16 x float> @test_16xfloat_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { -; CHECK-LABEL: test_16xfloat_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,6,7] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6612,12 +10301,19 @@ define <16 x float> @test_16xfloat_masked_shuff_mem_mask3(<16 x float> %vec1, <1 } define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -6626,21 +10322,34 @@ define <16 x float> @test_16xfloat_zero_masked_shuff_mem_mask3(<16 x float> %vec } define <4 x double> @test_4xdouble_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2) { -; CHECK-LABEL: test_4xdouble_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -6648,26 +10357,42 @@ define <4 x double> @test_4xdouble_masked_shuff_mask0(<4 x double> %vec1, <4 x d } define <4 x double> @test_4xdouble_zero_masked_shuff_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -6675,26 +10400,42 @@ define <4 x double> @test_4xdouble_masked_shuff_mask1(<4 x double> %vec1, <4 x d } define <4 x double> @test_4xdouble_zero_masked_shuff_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -6702,34 +10443,55 @@ define <4 x double> @test_4xdouble_masked_shuff_mask2(<4 x double> %vec1, <4 x d } define <4 x double> @test_4xdouble_zero_masked_shuff_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2) { -; CHECK-LABEL: test_4xdouble_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -6737,35 +10499,56 @@ define <4 x double> @test_4xdouble_masked_shuff_mask3(<4 x double> %vec1, <4 x d } define <4 x double> @test_4xdouble_zero_masked_shuff_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { -; CHECK-LABEL: test_4xdouble_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -6774,13 +10557,21 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask0(<4 x double> %vec1, <4 } define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -6789,13 +10580,21 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask0(<4 x double> %vec } define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -6804,13 +10603,21 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask1(<4 x double> %vec1, <4 } define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -6819,13 +10626,21 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask1(<4 x double> %vec } define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -6834,13 +10649,21 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask2(<4 x double> %vec1, <4 } define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -6849,22 +10672,35 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask2(<4 x double> %vec } define <4 x double> @test_4xdouble_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { -; CHECK-LABEL: test_4xdouble_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -6873,13 +10709,21 @@ define <4 x double> @test_4xdouble_masked_shuff_mem_mask3(<4 x double> %vec1, <4 } define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -6888,21 +10732,34 @@ define <4 x double> @test_4xdouble_zero_masked_shuff_mem_mask3(<4 x double> %vec } define <8 x double> @test_8xdouble_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2) { -; CHECK-LABEL: test_8xdouble_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -6910,25 +10767,40 @@ define <8 x double> @test_8xdouble_masked_shuff_mask0(<8 x double> %vec1, <8 x d } define <8 x double> @test_8xdouble_zero_masked_shuff_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,2,3],zmm1[6,7,0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -6936,25 +10808,40 @@ define <8 x double> @test_8xdouble_masked_shuff_mask1(<8 x double> %vec1, <8 x d } define <8 x double> @test_8xdouble_zero_masked_shuff_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,4,5] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -6962,33 +10849,53 @@ define <8 x double> @test_8xdouble_masked_shuff_mask2(<8 x double> %vec1, <8 x d } define <8 x double> @test_8xdouble_zero_masked_shuff_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[4,5,0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2) { -; CHECK-LABEL: test_8xdouble_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -6996,34 +10903,54 @@ define <8 x double> @test_8xdouble_masked_shuff_mask3(<8 x double> %vec1, <8 x d } define <8 x double> @test_8xdouble_zero_masked_shuff_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) { -; CHECK-LABEL: test_8xdouble_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -7032,12 +10959,19 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask0(<8 x double> %vec1, <8 } define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,0,1],mem[0,1,0,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -7046,13 +10980,21 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask0(<8 x double> %vec } define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -7061,12 +11003,19 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask1(<8 x double> %vec1, <8 } define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,6,7],mem[0,1,2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -7075,13 +11024,21 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask1(<8 x double> %vec } define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -7090,12 +11047,19 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask2(<8 x double> %vec1, <8 } define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,2,3],mem[0,1,4,5] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -7104,22 +11068,35 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask2(<8 x double> %vec } define <8 x double> @test_8xdouble_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) { -; CHECK-LABEL: test_8xdouble_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -7128,12 +11105,19 @@ define <8 x double> @test_8xdouble_masked_shuff_mem_mask3(<8 x double> %vec1, <8 } define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshuff64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[4,5,0,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -7142,21 +11126,34 @@ define <8 x double> @test_8xdouble_zero_masked_shuff_mem_mask3(<8 x double> %vec } define <8 x i32> @test_8xi32_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2) { -; CHECK-LABEL: test_8xi32_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3 @@ -7164,26 +11161,42 @@ define <8 x i32> @test_8xi32_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2 } define <8 x i32> @test_8xi32_zero_masked_shuff_mask0(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_zero_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3 @@ -7191,26 +11204,42 @@ define <8 x i32> @test_8xi32_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2 } define <8 x i32> @test_8xi32_zero_masked_shuff_mask1(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_zero_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3 @@ -7218,34 +11247,55 @@ define <8 x i32> @test_8xi32_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2 } define <8 x i32> @test_8xi32_zero_masked_shuff_mask2(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_zero_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_8xi32_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2) { -; CHECK-LABEL: test_8xi32_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> %vec3 @@ -7253,35 +11303,56 @@ define <8 x i32> @test_8xi32_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2 } define <8 x i32> @test_8xi32_zero_masked_shuff_mask3(<8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_zero_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_zero_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i32> %shuf, <8 x i32> zeroinitializer ret <8 x i32> %res } define <8 x i32> @test_8xi32_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p) { -; CHECK-LABEL: test_8xi32_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -7290,13 +11361,21 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* } define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -7305,13 +11384,21 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask0(<8 x i32> %vec1, <8 x i } define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -7320,13 +11407,21 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* } define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -7335,13 +11430,21 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask1(<8 x i32> %vec1, <8 x i } define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -7350,13 +11453,21 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* } define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -7365,22 +11476,35 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask2(<8 x i32> %vec1, <8 x i } define <8 x i32> @test_8xi32_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p) { -; CHECK-LABEL: test_8xi32_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %res = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> ret <8 x i32> %res } define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmd %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -7389,13 +11513,21 @@ define <8 x i32> @test_8xi32_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* } define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i32>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi32_zero_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i32>, <8 x i32>* %vec2p %shuf = shufflevector <8 x i32> %vec1, <8 x i32> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -7404,21 +11536,34 @@ define <8 x i32> @test_8xi32_zero_masked_shuff_mem_mask3(<8 x i32> %vec1, <8 x i } define <16 x i32> @test_16xi32_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2) { -; CHECK-LABEL: test_16xi32_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],zmm1[2,3,6,7] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 @@ -7426,25 +11571,40 @@ define <16 x i32> @test_16xi32_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> % } define <16 x i32> @test_16xi32_zero_masked_shuff_mask0(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_zero_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],zmm1[4,5,6,7,12,13,14,15] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 @@ -7452,25 +11612,40 @@ define <16 x i32> @test_16xi32_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> % } define <16 x i32> @test_16xi32_zero_masked_shuff_mask1(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_zero_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,8,9,10,11],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 @@ -7478,33 +11653,53 @@ define <16 x i32> @test_16xi32_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> % } define <16 x i32> @test_16xi32_zero_masked_shuff_mask2(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_zero_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],zmm1[0,1,2,3,0,1,2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test_16xi32_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2) { -; CHECK-LABEL: test_16xi32_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],zmm1[4,5,2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm2 {%k1} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> %vec3 @@ -7512,34 +11707,54 @@ define <16 x i32> @test_16xi32_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> % } define <16 x i32> @test_16xi32_zero_masked_shuff_mask3(<16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_zero_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_zero_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,0,1,2,3],zmm1[8,9,10,11,4,5,6,7] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x i32> %shuf, <16 x i32> zeroinitializer ret <16 x i32> %res } define <16 x i32> @test_16xi32_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p) { -; CHECK-LABEL: test_16xi32_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],mem[4,5,0,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -7548,12 +11763,19 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i3 } define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[8,9,10,11,4,5,6,7],mem[8,9,10,11,0,1,2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -7562,13 +11784,21 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask0(<16 x i32> %vec1, <16 } define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -7577,12 +11807,19 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i3 } define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[0,1,2,3,8,9,10,11] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -7591,13 +11828,21 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask1(<16 x i32> %vec1, <16 } define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -7606,12 +11851,19 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i3 } define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,8,9,10,11],mem[12,13,14,15,12,13,14,15] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -7620,22 +11872,35 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask2(<16 x i32> %vec1, <16 } define <16 x i32> @test_16xi32_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p) { -; CHECK-LABEL: test_16xi32_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[2,3,6,7] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %res = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> ret <16 x i32> %res } define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm1 {%k1} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -7644,12 +11909,19 @@ define <16 x i32> @test_16xi32_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i3 } define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 x i32>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xi32_zero_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xi32_zero_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xi32_zero_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi32x4 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,6,7,4,5,6,7],mem[4,5,6,7,12,13,14,15] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x i32>, <16 x i32>* %vec2p %shuf = shufflevector <16 x i32> %vec1, <16 x i32> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -7658,21 +11930,34 @@ define <16 x i32> @test_16xi32_zero_masked_shuff_mem_mask3(<16 x i32> %vec1, <16 } define <4 x i64> @test_4xi64_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2) { -; CHECK-LABEL: test_4xi64_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 @@ -7680,26 +11965,42 @@ define <4 x i64> @test_4xi64_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2 } define <4 x i64> @test_4xi64_zero_masked_shuff_mask0(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_zero_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 @@ -7707,26 +12008,42 @@ define <4 x i64> @test_4xi64_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2 } define <4 x i64> @test_4xi64_zero_masked_shuff_mask1(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_zero_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 @@ -7734,34 +12051,55 @@ define <4 x i64> @test_4xi64_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2 } define <4 x i64> @test_4xi64_zero_masked_shuff_mask2(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_zero_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[0,1] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_4xi64_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2) { -; CHECK-LABEL: test_4xi64_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 +; GENERIC-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> %vec3 @@ -7769,35 +12107,56 @@ define <4 x i64> @test_4xi64_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2 } define <4 x i64> @test_4xi64_zero_masked_shuff_mask3(<4 x i64> %vec1, <4 x i64> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_zero_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [1:1.00] +; GENERIC-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 +; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_zero_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] sched: [3:1.00] +; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm1, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x i64> %shuf, <4 x i64> zeroinitializer ret <4 x i64> %res } define <4 x i64> @test_4xi64_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p) { -; CHECK-LABEL: test_4xi64_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -7806,13 +12165,21 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* } define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -7821,13 +12188,21 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask0(<4 x i64> %vec1, <4 x i } define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -7836,13 +12211,21 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* } define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -7851,13 +12234,21 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask1(<4 x i64> %vec1, <4 x i } define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -7866,13 +12257,21 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* } define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[0,1] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -7881,22 +12280,35 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask2(<4 x i64> %vec1, <4 x i } define <4 x i64> @test_4xi64_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p) { -; CHECK-LABEL: test_4xi64_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %res = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> ret <4 x i64> %res } define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -7905,13 +12317,21 @@ define <4 x i64> @test_4xi64_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* } define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i64>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [5:1.00] +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xi64_zero_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],mem[2,3] sched: [10:1.00] +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z} # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x i64>, <4 x i64>* %vec2p %shuf = shufflevector <4 x i64> %vec1, <4 x i64> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -7920,21 +12340,34 @@ define <4 x i64> @test_4xi64_zero_masked_shuff_mem_mask3(<4 x i64> %vec1, <4 x i } define <8 x i64> @test_8xi64_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2) { -; CHECK-LABEL: test_8xi64_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 @@ -7942,25 +12375,40 @@ define <8 x i64> @test_8xi64_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2 } define <8 x i64> @test_8xi64_zero_masked_shuff_mask0(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_zero_masked_shuff_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,4,5],zmm1[4,5,4,5] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 @@ -7968,25 +12416,40 @@ define <8 x i64> @test_8xi64_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2 } define <8 x i64> @test_8xi64_zero_masked_shuff_mask1(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_zero_masked_shuff_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[6,7,4,5],zmm1[2,3,4,5] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 @@ -7994,33 +12457,53 @@ define <8 x i64> @test_8xi64_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2 } define <8 x i64> @test_8xi64_zero_masked_shuff_mask2(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_zero_masked_shuff_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[0,1,4,5],zmm1[0,1,0,1] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_8xi64_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2) { -; CHECK-LABEL: test_8xi64_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] -; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] +; GENERIC-NEXT: vmovdqa64 %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm2 {%k1} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> %vec3 @@ -8028,34 +12511,54 @@ define <8 x i64> @test_8xi64_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2 } define <8 x i64> @test_8xi64_zero_masked_shuff_mask3(<8 x i64> %vec1, <8 x i64> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_zero_masked_shuff_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_zero_masked_shuff_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,6,7],zmm1[4,5,2,3] sched: [3:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x i64> %shuf, <8 x i64> zeroinitializer ret <8 x i64> %res } define <8 x i64> @test_8xi64_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p) { -; CHECK-LABEL: test_8xi64_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -8064,12 +12567,19 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* } define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,2,3],mem[4,5,2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -8078,13 +12588,21 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask0(<8 x i64> %vec1, <8 x i } define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -8093,12 +12611,19 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* } define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[0,1,0,1] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -8107,13 +12632,21 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask1(<8 x i64> %vec1, <8 x i } define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -8122,12 +12655,19 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* } define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[4,5,0,1],mem[2,3,2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -8136,22 +12676,35 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask2(<8 x i64> %vec1, <8 x i } define <8 x i64> @test_8xi64_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p) { -; CHECK-LABEL: test_8xi64_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %res = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> ret <8 x i64> %res } define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] -; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] +; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -8160,12 +12713,19 @@ define <8 x i64> @test_8xi64_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* } define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i64>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xi64_zero_masked_shuff_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xi64_zero_masked_shuff_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xi64_zero_masked_shuff_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,0,1],mem[6,7,2,3] sched: [10:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x i64>, <8 x i64>* %vec2p %shuf = shufflevector <8 x i64> %vec1, <8 x i64> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -8174,21 +12734,34 @@ define <8 x i64> @test_8xi64_zero_masked_shuff_mem_mask3(<8 x i64> %vec1, <8 x i } define <4 x float> @test_4xfloat_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2) { -; CHECK-LABEL: test_4xfloat_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 @@ -8196,25 +12769,40 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask0(<4 x float> %vec1, <4 x } define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 @@ -8222,25 +12810,40 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask1(<4 x float> %vec1, <4 x } define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 @@ -8248,33 +12851,53 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask2(<4 x float> %vec1, <4 x } define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_4xfloat_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2) { -; CHECK-LABEL: test_4xfloat_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 @@ -8282,34 +12905,54 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mask3(<4 x float> %vec1, <4 x } define <4 x float> @test_4xfloat_zero_masked_unpack_low_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_4xfloat_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) { -; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -8318,12 +12961,19 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask0(<4 x float> %vec1, } define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -8332,13 +12982,21 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask0(<4 x float> %v } define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -8347,12 +13005,19 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask1(<4 x float> %vec1, } define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -8361,13 +13026,21 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask1(<4 x float> %v } define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -8376,12 +13049,19 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask2(<4 x float> %vec1, } define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -8390,22 +13070,35 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask2(<4 x float> %v } define <4 x float> @test_4xfloat_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) { -; CHECK-LABEL: test_4xfloat_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -8414,12 +13107,19 @@ define <4 x float> @test_4xfloat_masked_unpack_low_mem_mask3(<4 x float> %vec1, } define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -8428,21 +13128,34 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_low_mem_mask3(<4 x float> %v } define <8 x float> @test_8xfloat_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2) { -; CHECK-LABEL: test_8xfloat_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -8450,25 +13163,40 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask0(<8 x float> %vec1, <8 x } define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -8476,25 +13204,40 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask1(<8 x float> %vec1, <8 x } define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -8502,33 +13245,53 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask2(<8 x float> %vec1, <8 x } define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2) { -; CHECK-LABEL: test_8xfloat_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -8536,34 +13299,54 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mask3(<8 x float> %vec1, <8 x } define <8 x float> @test_8xfloat_zero_masked_unpack_low_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { -; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -8572,12 +13355,19 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask0(<8 x float> %vec1, } define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -8586,13 +13376,21 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask0(<8 x float> %v } define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -8601,12 +13399,19 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask1(<8 x float> %vec1, } define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -8615,13 +13420,21 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask1(<8 x float> %v } define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -8630,12 +13443,19 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask2(<8 x float> %vec1, } define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -8644,22 +13464,35 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask2(<8 x float> %v } define <8 x float> @test_8xfloat_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { -; CHECK-LABEL: test_8xfloat_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -8668,12 +13501,19 @@ define <8 x float> @test_8xfloat_masked_unpack_low_mem_mask3(<8 x float> %vec1, } define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -8682,21 +13522,34 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_low_mem_mask3(<8 x float> %v } define <16 x float> @test_16xfloat_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2) { -; CHECK-LABEL: test_16xfloat_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -8704,25 +13557,40 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask0(<16 x float> %vec1, < } define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -8730,25 +13598,40 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask1(<16 x float> %vec1, < } define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -8756,33 +13639,53 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask2(<16 x float> %vec1, < } define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2) { -; CHECK-LABEL: test_16xfloat_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -8790,34 +13693,54 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mask3(<16 x float> %vec1, < } define <16 x float> @test_16xfloat_zero_masked_unpack_low_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { -; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -8826,12 +13749,19 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask0(<16 x float> %vec } define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -8840,13 +13770,21 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask0(<16 x float> } define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -8855,12 +13793,19 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask1(<16 x float> %vec } define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -8869,13 +13814,21 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask1(<16 x float> } define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -8884,12 +13837,19 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask2(<16 x float> %vec } define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -8898,22 +13858,35 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask2(<16 x float> } define <16 x float> @test_16xfloat_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { -; CHECK-LABEL: test_16xfloat_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -8922,12 +13895,19 @@ define <16 x float> @test_16xfloat_masked_unpack_low_mem_mask3(<16 x float> %vec } define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklps {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[1],mem[1],zmm0[4],mem[4],zmm0[5],mem[5],zmm0[8],mem[8],zmm0[9],mem[9],zmm0[12],mem[12],zmm0[13],mem[13] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -8936,21 +13916,34 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_low_mem_mask3(<16 x float> } define <2 x double> @test_2xdouble_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2) { -; CHECK-LABEL: test_2xdouble_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> ret <2 x double> %res } define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00] -; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00] +; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3 @@ -8958,25 +13951,40 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask0(<2 x double> %vec1, < } define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer ret <2 x double> %res } define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00] -; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] +; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm2 {%k1} = xmm0[0],xmm1[0] sched: [1:1.00] +; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3 @@ -8984,34 +13992,54 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mask1(<2 x double> %vec1, < } define <2 x double> @test_2xdouble_zero_masked_unpack_low_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],xmm1[0] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer ret <2 x double> %res } define <2 x double> @test_2xdouble_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) { -; CHECK-LABEL: test_2xdouble_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> ret <2 x double> %res } define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] +; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -9020,12 +14048,19 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask0(<2 x double> %vec } define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -9034,13 +14069,21 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask0(<2 x double> } define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] +; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 {%k1} = xmm0[0],mem[0] sched: [7:1.00] +; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -9049,12 +14092,19 @@ define <2 x double> @test_2xdouble_masked_unpack_low_mem_mask1(<2 x double> %vec } define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_zero_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -9063,21 +14113,34 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_low_mem_mask1(<2 x double> } define <4 x double> @test_4xdouble_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2) { -; CHECK-LABEL: test_4xdouble_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -9085,25 +14148,40 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask0(<4 x double> %vec1, < } define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -9111,25 +14189,40 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask1(<4 x double> %vec1, < } define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -9137,33 +14230,53 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask2(<4 x double> %vec1, < } define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2) { -; CHECK-LABEL: test_4xdouble_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -9171,34 +14284,54 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mask3(<4 x double> %vec1, < } define <4 x double> @test_4xdouble_zero_masked_unpack_low_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { -; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -9207,12 +14340,19 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask0(<4 x double> %vec } define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -9221,13 +14361,21 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask0(<4 x double> } define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -9236,12 +14384,19 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask1(<4 x double> %vec } define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -9250,13 +14405,21 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask1(<4 x double> } define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -9265,12 +14428,19 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask2(<4 x double> %vec } define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -9279,22 +14449,35 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask2(<4 x double> } define <4 x double> @test_4xdouble_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { -; CHECK-LABEL: test_4xdouble_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm1 {%k1} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -9303,12 +14486,19 @@ define <4 x double> @test_4xdouble_masked_unpack_low_mem_mask3(<4 x double> %vec } define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} ymm0 {%k1} {z} = ymm0[0],mem[0],ymm0[2],mem[2] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -9317,21 +14507,34 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_low_mem_mask3(<4 x double> } define <8 x double> @test_8xdouble_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2) { -; CHECK-LABEL: test_8xdouble_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -9339,25 +14542,40 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask0(<8 x double> %vec1, < } define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -9365,25 +14583,40 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask1(<8 x double> %vec1, < } define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -9391,33 +14624,53 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask2(<8 x double> %vec1, < } define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2) { -; CHECK-LABEL: test_8xdouble_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -9425,34 +14678,54 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mask3(<8 x double> %vec1, < } define <8 x double> @test_8xdouble_zero_masked_unpack_low_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) { -; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -9461,12 +14734,19 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask0(<8 x double> %vec } define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -9475,13 +14755,21 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask0(<8 x double> } define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -9490,12 +14778,19 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask1(<8 x double> %vec } define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -9504,13 +14799,21 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask1(<8 x double> } define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -9519,12 +14822,19 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask2(<8 x double> %vec } define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -9533,22 +14843,35 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask2(<8 x double> } define <8 x double> @test_8xdouble_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) { -; CHECK-LABEL: test_8xdouble_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm1 {%k1} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -9557,12 +14880,19 @@ define <8 x double> @test_8xdouble_masked_unpack_low_mem_mask3(<8 x double> %vec } define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_low_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],mem[0],zmm0[2],mem[2],zmm0[4],mem[4],zmm0[6],mem[6] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -9571,21 +14901,34 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_low_mem_mask3(<8 x double> } define <4 x float> @test_4xfloat_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2) { -; CHECK-LABEL: test_4xfloat_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 @@ -9593,25 +14936,40 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask0(<4 x float> %vec1, <4 } define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask0(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 @@ -9619,25 +14977,40 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask1(<4 x float> %vec1, <4 } define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask1(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 @@ -9645,33 +15018,53 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask2(<4 x float> %vec1, <4 } define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask2(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_4xfloat_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2) { -; CHECK-LABEL: test_4xfloat_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> %vec3 @@ -9679,34 +15072,54 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mask3(<4 x float> %vec1, <4 } define <4 x float> @test_4xfloat_zero_masked_unpack_high_mask3(<4 x float> %vec1, <4 x float> %vec2, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x float> %shuf, <4 x float> zeroinitializer ret <4 x float> %res } define <4 x float> @test_4xfloat_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p) { -; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -9715,12 +15128,19 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask0(<4 x float> %vec1, } define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -9729,13 +15149,21 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask0(<4 x float> % } define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -9744,12 +15172,19 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask1(<4 x float> %vec1, } define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -9758,13 +15193,21 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask1(<4 x float> % } define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -9773,12 +15216,19 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask2(<4 x float> %vec1, } define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -9787,22 +15237,35 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask2(<4 x float> % } define <4 x float> @test_4xfloat_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p) { -; CHECK-LABEL: test_4xfloat_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %res = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> ret <4 x float> %res } define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x float> %vec3, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm1 {%k1} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: vmovaps %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -9811,12 +15274,19 @@ define <4 x float> @test_4xfloat_masked_unpack_high_mem_mask3(<4 x float> %vec1, } define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> %vec1, <4 x float>* %vec2p, <4 x i32> %mask) { -; CHECK-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xfloat_zero_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} xmm0 {%k1} {z} = xmm0[2],mem[2],xmm0[3],mem[3] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x float>, <4 x float>* %vec2p %shuf = shufflevector <4 x float> %vec1, <4 x float> %vec2, <4 x i32> %cmp = icmp eq <4 x i32> %mask, zeroinitializer @@ -9825,21 +15295,34 @@ define <4 x float> @test_4xfloat_zero_masked_unpack_high_mem_mask3(<4 x float> % } define <8 x float> @test_8xfloat_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2) { -; CHECK-LABEL: test_8xfloat_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -9847,25 +15330,40 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask0(<8 x float> %vec1, <8 } define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask0(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -9873,25 +15371,40 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask1(<8 x float> %vec1, <8 } define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask1(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -9899,33 +15412,53 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask2(<8 x float> %vec1, <8 } define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask2(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2) { -; CHECK-LABEL: test_8xfloat_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; GENERIC-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> %vec3 @@ -9933,34 +15466,54 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mask3(<8 x float> %vec1, <8 } define <8 x float> @test_8xfloat_zero_masked_unpack_high_mask3(<8 x float> %vec1, <8 x float> %vec2, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x float> %shuf, <8 x float> zeroinitializer ret <8 x float> %res } define <8 x float> @test_8xfloat_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p) { -; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9969,12 +15522,19 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask0(<8 x float> %vec1, } define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9983,13 +15543,21 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask0(<8 x float> % } define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -9998,12 +15566,19 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask1(<8 x float> %vec1, } define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -10012,13 +15587,21 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask1(<8 x float> % } define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -10027,12 +15610,19 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask2(<8 x float> %vec1, } define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -10041,22 +15631,35 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask2(<8 x float> % } define <8 x float> @test_8xfloat_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p) { -; CHECK-LABEL: test_8xfloat_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %res = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> ret <8 x float> %res } define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x float> %vec3, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; GENERIC-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm1 {%k1} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: vmovaps %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -10065,12 +15668,19 @@ define <8 x float> @test_8xfloat_masked_unpack_high_mem_mask3(<8 x float> %vec1, } define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> %vec1, <8 x float>* %vec2p, <8 x i32> %mask) { -; CHECK-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xfloat_zero_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} ymm0 {%k1} {z} = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x float>, <8 x float>* %vec2p %shuf = shufflevector <8 x float> %vec1, <8 x float> %vec2, <8 x i32> %cmp = icmp eq <8 x i32> %mask, zeroinitializer @@ -10079,21 +15689,34 @@ define <8 x float> @test_8xfloat_zero_masked_unpack_high_mem_mask3(<8 x float> % } define <16 x float> @test_16xfloat_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2) { -; CHECK-LABEL: test_16xfloat_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -10101,25 +15724,40 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask0(<16 x float> %vec1, } define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask0(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -10127,25 +15765,40 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask1(<16 x float> %vec1, } define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask1(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -10153,33 +15806,53 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask2(<16 x float> %vec1, } define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask2(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2) { -; CHECK-LABEL: test_16xfloat_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: vmovaps %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: vmovaps %zmm2, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> %vec3 @@ -10187,34 +15860,54 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mask3(<16 x float> %vec1, } define <16 x float> @test_16xfloat_zero_masked_unpack_high_mask3(<16 x float> %vec1, <16 x float> %vec2, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer %res = select <16 x i1> %cmp, <16 x float> %shuf, <16 x float> zeroinitializer ret <16 x float> %res } define <16 x float> @test_16xfloat_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p) { -; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -10223,12 +15916,19 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask0(<16 x float> %ve } define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -10237,13 +15937,21 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask0(<16 x float } define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -10252,12 +15960,19 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask1(<16 x float> %ve } define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -10266,13 +15981,21 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask1(<16 x float } define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -10281,12 +16004,19 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask2(<16 x float> %ve } define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -10295,22 +16025,35 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask2(<16 x float } define <16 x float> @test_16xfloat_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p) { -; CHECK-LABEL: test_16xfloat_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %res = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> ret <16 x float> %res } define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x float> %vec3, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: vmovaps %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm1 {%k1} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:0.33] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -10319,12 +16062,19 @@ define <16 x float> @test_16xfloat_masked_unpack_high_mem_mask3(<16 x float> %ve } define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float> %vec1, <16 x float>* %vec2p, <16 x i32> %mask) { -; CHECK-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_16xfloat_zero_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqd %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhps {{.*#+}} zmm0 {%k1} {z} = zmm0[2],mem[2],zmm0[3],mem[3],zmm0[6],mem[6],zmm0[7],mem[7],zmm0[10],mem[10],zmm0[11],mem[11],zmm0[14],mem[14],zmm0[15],mem[15] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <16 x float>, <16 x float>* %vec2p %shuf = shufflevector <16 x float> %vec1, <16 x float> %vec2, <16 x i32> %cmp = icmp eq <16 x i32> %mask, zeroinitializer @@ -10333,21 +16083,34 @@ define <16 x float> @test_16xfloat_zero_masked_unpack_high_mem_mask3(<16 x float } define <2 x double> @test_2xdouble_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2) { -; CHECK-LABEL: test_2xdouble_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> ret <2 x double> %res } define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3 @@ -10355,25 +16118,40 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask0(<2 x double> %vec1, } define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask0(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer ret <2 x double> %res } define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x double> %vec3, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] +; GENERIC-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm4, %xmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> %vec3 @@ -10381,34 +16159,54 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mask1(<2 x double> %vec1, } define <2 x double> @test_2xdouble_zero_masked_unpack_high_mask1(<2 x double> %vec1, <2 x double> %vec2, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[1] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer %res = select <2 x i1> %cmp, <2 x double> %shuf, <2 x double> zeroinitializer ret <2 x double> %res } define <2 x double> @test_2xdouble_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p) { -; CHECK-LABEL: test_2xdouble_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %res = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> ret <2 x double> %res } define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] +; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -10417,12 +16215,19 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask0(<2 x double> %ve } define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -10431,13 +16236,21 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask0(<2 x double } define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x double> %vec3, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] +; GENERIC-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm3, %xmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} xmm1 {%k1} = xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: vmovapd %xmm1, %xmm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -10446,12 +16259,19 @@ define <2 x double> @test_2xdouble_masked_unpack_high_mem_mask1(<2 x double> %ve } define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double> %vec1, <2 x double>* %vec2p, <2 x i64> %mask) { -; CHECK-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_2xdouble_zero_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %xmm2, %xmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} xmm0 {%k1} {z} = xmm0[1],mem[1] sched: [7:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <2 x double>, <2 x double>* %vec2p %shuf = shufflevector <2 x double> %vec1, <2 x double> %vec2, <2 x i32> %cmp = icmp eq <2 x i64> %mask, zeroinitializer @@ -10460,21 +16280,34 @@ define <2 x double> @test_2xdouble_zero_masked_unpack_high_mem_mask1(<2 x double } define <4 x double> @test_4xdouble_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2) { -; CHECK-LABEL: test_4xdouble_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -10482,25 +16315,40 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask0(<4 x double> %vec1, } define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask0(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -10508,25 +16356,40 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask1(<4 x double> %vec1, } define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask1(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -10534,33 +16397,53 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask2(<4 x double> %vec1, } define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask2(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2) { -; CHECK-LABEL: test_4xdouble_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; GENERIC-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm4, %ymm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> %vec3 @@ -10568,34 +16451,54 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mask3(<4 x double> %vec1, } define <4 x double> @test_4xdouble_zero_masked_unpack_high_mask3(<4 x double> %vec1, <4 x double> %vec2, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer %res = select <4 x i1> %cmp, <4 x double> %shuf, <4 x double> zeroinitializer ret <4 x double> %res } define <4 x double> @test_4xdouble_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p) { -; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10604,12 +16507,19 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask0(<4 x double> %ve } define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10618,13 +16528,21 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask0(<4 x double } define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10633,12 +16551,19 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask1(<4 x double> %ve } define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10647,13 +16572,21 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask1(<4 x double } define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10662,12 +16595,19 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask2(<4 x double> %ve } define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10676,22 +16616,35 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask2(<4 x double } define <4 x double> @test_4xdouble_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p) { -; CHECK-LABEL: test_4xdouble_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %res = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> ret <4 x double> %res } define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x double> %vec3, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] +; GENERIC-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm3, %ymm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm1 {%k1} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: vmovapd %ymm1, %ymm0 # sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10700,12 +16653,19 @@ define <4 x double> @test_4xdouble_masked_unpack_high_mem_mask3(<4 x double> %ve } define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double> %vec1, <4 x double>* %vec2p, <4 x i64> %mask) { -; CHECK-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_4xdouble_zero_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %ymm2, %ymm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} ymm0 {%k1} {z} = ymm0[1],mem[1],ymm0[3],mem[3] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <4 x double>, <4 x double>* %vec2p %shuf = shufflevector <4 x double> %vec1, <4 x double> %vec2, <4 x i32> %cmp = icmp eq <4 x i64> %mask, zeroinitializer @@ -10714,21 +16674,34 @@ define <4 x double> @test_4xdouble_zero_masked_unpack_high_mem_mask3(<4 x double } define <8 x double> @test_8xdouble_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2) { -; CHECK-LABEL: test_8xdouble_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -10736,25 +16709,40 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask0(<8 x double> %vec1, } define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask0(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -10762,25 +16750,40 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask1(<8 x double> %vec1, } define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask1(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -10788,33 +16791,53 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask2(<8 x double> %vec1, } define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask2(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2) { -; CHECK-LABEL: test_8xdouble_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: vmovapd %zmm2, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: vmovapd %zmm2, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm4, %zmm3, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: vmovapd %zmm2, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> %vec3 @@ -10822,34 +16845,54 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mask3(<8 x double> %vec1, } define <8 x double> @test_8xdouble_zero_masked_unpack_high_mask3(<8 x double> %vec1, <8 x double> %vec2, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] sched: [1:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer %res = select <8 x i1> %cmp, <8 x double> %shuf, <8 x double> zeroinitializer ret <8 x double> %res } define <8 x double> @test_8xdouble_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p) { -; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10858,12 +16901,19 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask0(<8 x double> %ve } define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask0: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10872,13 +16922,21 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask0(<8 x double } define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10887,12 +16945,19 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask1(<8 x double> %ve } define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask1: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10901,13 +16966,21 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask1(<8 x double } define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10916,12 +16989,19 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask2(<8 x double> %ve } define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask2: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10930,22 +17010,35 @@ define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask2(<8 x double } define <8 x double> @test_8xdouble_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p) { -; CHECK-LABEL: test_8xdouble_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %res = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> ret <8 x double> %res } define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x double> %vec3, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: vmovapd %zmm1, %zmm0 -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: vmovapd %zmm1, %zmm0 +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm3, %zmm2, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm1 {%k1} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer @@ -10954,12 +17047,19 @@ define <8 x double> @test_8xdouble_masked_unpack_high_mem_mask3(<8 x double> %ve } define <8 x double> @test_8xdouble_zero_masked_unpack_high_mem_mask3(<8 x double> %vec1, <8 x double>* %vec2p, <8 x i64> %mask) { -; CHECK-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3: -; CHECK: # BB#0: -; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] -; CHECK-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] -; CHECK-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] -; CHECK-NEXT: retq # sched: [7:1.00] +; GENERIC-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3: +; GENERIC: # BB#0: +; GENERIC-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; GENERIC-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 +; GENERIC-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] +; GENERIC-NEXT: retq # sched: [1:1.00] +; +; SKX-LABEL: test_8xdouble_zero_masked_unpack_high_mem_mask3: +; SKX: # BB#0: +; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 # sched: [1:0.33] +; SKX-NEXT: vpcmpeqq %zmm2, %zmm1, %k1 # sched: [3:1.00] +; SKX-NEXT: vunpckhpd {{.*#+}} zmm0 {%k1} {z} = zmm0[1],mem[1],zmm0[3],mem[3],zmm0[5],mem[5],zmm0[7],mem[7] sched: [8:1.00] +; SKX-NEXT: retq # sched: [7:1.00] %vec2 = load <8 x double>, <8 x double>* %vec2p %shuf = shufflevector <8 x double> %vec1, <8 x double> %vec2, <8 x i32> %cmp = icmp eq <8 x i64> %mask, zeroinitializer -- 2.40.0