From 43b90b17cdb894d8e136511e24dbdd2e57d4ef97 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 7 Aug 2019 16:33:37 +0000
Subject: [PATCH] Recommit r368079 "[X86] Remove uses of the
 -x86-experimental-vector-widening-legalization flag from test/CodeGen/X86/"

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@368184 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/SwizzleShuff.ll               |    2 +-
 test/CodeGen/X86/avx512-cvt-widen.ll           | 2645 --------
 test/CodeGen/X86/avx512-trunc-widen.ll         | 1035 ---
 test/CodeGen/X86/bswap-vector.ll               |   72 -
 test/CodeGen/X86/lower-bitcast.ll              |   50 -
 .../X86/masked_gather_scatter_widen.ll         |  225 +-
 test/CodeGen/X86/pmulh.ll                      |    2 -
 test/CodeGen/X86/shrink_vmul-widen.ll          | 2553 -------
 .../CodeGen/X86/shuffle-vs-trunc-128-widen.ll  |  574 --
 .../CodeGen/X86/shuffle-vs-trunc-256-widen.ll  | 1454 ----
 .../CodeGen/X86/shuffle-vs-trunc-512-widen.ll  |  903 ---
 test/CodeGen/X86/vec_cast2.ll                  |  119 -
 test/CodeGen/X86/vec_cast3.ll                  |   87 -
 test/CodeGen/X86/vec_fp_to_int-widen.ll        | 2794 --------
 test/CodeGen/X86/vec_int_to_fp-widen.ll        | 6008 -----------------
 test/CodeGen/X86/vector-idiv-v2i32.ll          |  216 -
 .../{vec_clz.ll => vector-lzcnt-sub128.ll}     |    2 +-
 test/CodeGen/X86/vector-reduce-add-widen.ll    | 1386 ----
 test/CodeGen/X86/vector-reduce-and-widen.ll    | 1168 ----
 test/CodeGen/X86/vector-reduce-mul-widen.ll    | 3022 ---------
 test/CodeGen/X86/vector-reduce-or-widen.ll     | 1168 ----
 test/CodeGen/X86/vector-reduce-smax-widen.ll   | 2001 ------
 test/CodeGen/X86/vector-reduce-smin-widen.ll   | 1999 ------
 test/CodeGen/X86/vector-reduce-umax-widen.ll   | 2203 ------
 test/CodeGen/X86/vector-reduce-umin-widen.ll   | 2007 ------
 test/CodeGen/X86/vector-reduce-xor-widen.ll    | 1168 ----
 test/CodeGen/X86/vector-sext-widen.ll          | 3966 -----------
 .../X86/vector-shift-ashr-sub128-widen.ll      | 2481 -------
 .../X86/vector-shift-lshr-sub128-widen.ll      | 2151 ------
 .../X86/vector-shift-shl-sub128-widen.ll       | 1940 ------
 test/CodeGen/X86/vector-trunc-math-widen.ll    | 5197 --------------
 test/CodeGen/X86/vector-trunc-packus-widen.ll  | 3079 ---------
 test/CodeGen/X86/vector-trunc-ssat-widen.ll    | 3050 ---------
 test/CodeGen/X86/vector-trunc-usat-widen.ll    | 2430 -------
 test/CodeGen/X86/vector-trunc-widen.ll         | 2126 ------
 test/CodeGen/X86/vector-trunc.ll               |  126 +
 test/CodeGen/X86/vector-zext-widen.ll          | 2741 --------
 test/CodeGen/X86/widen_cast-4.ll               |   38 +-
 test/CodeGen/X86/widen_conversions.ll          |    4 +-
 test/CodeGen/X86/widen_mul.ll                  |   10 +-
 40 files changed, 139 insertions(+), 64063 deletions(-)
 delete mode 100644 test/CodeGen/X86/avx512-cvt-widen.ll
 delete mode 100644 test/CodeGen/X86/avx512-trunc-widen.ll
 delete mode 100644 test/CodeGen/X86/shrink_vmul-widen.ll
 delete mode 100644 test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll
 delete mode 100644 test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll
 delete mode 100644 test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
 delete mode 100644 test/CodeGen/X86/vec_fp_to_int-widen.ll
 delete mode 100644 test/CodeGen/X86/vec_int_to_fp-widen.ll
 rename test/CodeGen/X86/{vec_clz.ll => vector-lzcnt-sub128.ll} (96%)
 delete mode 100644 test/CodeGen/X86/vector-reduce-add-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-reduce-and-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-reduce-mul-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-reduce-or-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-reduce-smax-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-reduce-smin-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-reduce-umax-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-reduce-umin-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-reduce-xor-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-sext-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-shift-shl-sub128-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-trunc-math-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-trunc-packus-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-trunc-ssat-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-trunc-usat-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-trunc-widen.ll
 delete mode 100644 test/CodeGen/X86/vector-zext-widen.ll

diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
index e6519a60a4b..4019ee42679 100644
--- a/test/CodeGen/X86/SwizzleShuff.ll
+++ b/test/CodeGen/X86/SwizzleShuff.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s

 ; Check that we perform a scalar XOR on i32.

diff --git a/test/CodeGen/X86/avx512-cvt-widen.ll b/test/CodeGen/X86/avx512-cvt-widen.ll
deleted file mode 100644
index eddd0039507..00000000000
--- a/test/CodeGen/X86/avx512-cvt-widen.ll
+++ /dev/null
@@ -1,2645 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQNOVL --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
-
-
-define <16 x float> @sitof32(<16 x i32> %a) nounwind {
-; ALL-LABEL: sitof32:
-; ALL:       # %bb.0:
-; ALL-NEXT:    vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT:    retq
-  %b = sitofp <16 x i32> %a to <16 x float>
-  ret <16 x float> %b
-}
-
-define <8 x double> @sltof864(<8 x i64> %a) {
-; NODQ-LABEL: sltof864:
-; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: sltof864: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: sltof864: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 -; DQNOVL-NEXT: retq - %b = sitofp <8 x i64> %a to <8 x double> - ret <8 x double> %b -} - -define <4 x double> @slto4f64(<4 x i64> %a) { -; NODQ-LABEL: slto4f64: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: slto4f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: slto4f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; DQNOVL-NEXT: retq - %b = sitofp <4 x i64> %a to <4 x double> - ret <4 x double> %b -} - -define <2 x double> @slto2f64(<2 x i64> %a) { -; NODQ-LABEL: slto2f64: -; NODQ: # %bb.0: -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; NODQ-NEXT: retq -; -; VLDQ-LABEL: slto2f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: slto2f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %b = sitofp <2 x i64> %a to <2 x double> - ret <2 x double> %b -} - -define <2 x float> @sltof2f32(<2 x i64> %a) { -; NOVLDQ-LABEL: sltof2f32: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax -; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; NOVLDQ-NEXT: vmovq %xmm0, %rax -; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; 
NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: sltof2f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: sltof2f32: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VLNODQ-NEXT: vmovq %xmm0, %rax -; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VLNODQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; VLNODQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: sltof2f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %b = sitofp <2 x i64> %a to <2 x float> - ret <2 x float>%b -} - -define <4 x float> @slto4f32_mem(<4 x i64>* %a) { -; NODQ-LABEL: slto4f32_mem: -; NODQ: # %bb.0: -; NODQ-NEXT: vmovdqu (%rdi), %xmm0 -; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; NODQ-NEXT: retq -; -; VLDQ-LABEL: slto4f32_mem: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: slto4f32_mem: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vmovups (%rdi), %ymm0 -; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %a1 = load <4 x i64>, <4 x i64>* %a, align 8 - %b = sitofp <4 x i64> %a1 to <4 x float> - ret <4 x float>%b -} - -define <4 x i64> @f64to4sl(<4 x double> %a) { -; NODQ-LABEL: f64to4sl: -; NODQ: # %bb.0: -; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1 -; NODQ-NEXT: vcvttsd2si %xmm1, %rax -; NODQ-NEXT: vmovq %rax, %xmm2 -; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; NODQ-NEXT: vcvttsd2si %xmm1, %rax -; NODQ-NEXT: vmovq %rax, %xmm1 -; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; NODQ-NEXT: vcvttsd2si %xmm0, %rax -; NODQ-NEXT: vmovq %rax, %xmm2 -; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; NODQ-NEXT: vcvttsd2si %xmm0, %rax -; NODQ-NEXT: vmovq %rax, %xmm0 -; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: f64to4sl: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: f64to4sl: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; DQNOVL-NEXT: vcvttpd2qq %zmm0, %zmm0 -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; DQNOVL-NEXT: retq - %b = fptosi <4 x double> %a to <4 x i64> - ret <4 x i64> %b -} - -define <4 x i64> @f32to4sl(<4 x float> %a) { -; NODQ-LABEL: f32to4sl: -; NODQ: # %bb.0: -; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; NODQ-NEXT: vcvttss2si %xmm1, %rax -; NODQ-NEXT: vmovq %rax, %xmm1 -; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; NODQ-NEXT: vcvttss2si %xmm2, %rax -; NODQ-NEXT: vmovq %rax, %xmm2 -; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; NODQ-NEXT: vcvttss2si %xmm0, %rax -; NODQ-NEXT: vmovq %rax, %xmm2 -; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = 
xmm0[1,1,3,3] -; NODQ-NEXT: vcvttss2si %xmm0, %rax -; NODQ-NEXT: vmovq %rax, %xmm0 -; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: f32to4sl: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: f32to4sl: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; DQNOVL-NEXT: vcvttps2qq %ymm0, %zmm0 -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; DQNOVL-NEXT: retq - %b = fptosi <4 x float> %a to <4 x i64> - ret <4 x i64> %b -} - -define <4 x float> @slto4f32(<4 x i64> %a) { -; NODQ-LABEL: slto4f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; NODQ-NEXT: vzeroupper -; NODQ-NEXT: retq -; -; VLDQ-LABEL: slto4f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 -; VLDQ-NEXT: vzeroupper -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: slto4f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %b = sitofp <4 x i64> %a to <4 x float> - ret <4 x float> %b -} - -define <4 x float> @ulto4f32(<4 x i64> %a) { -; NODQ-LABEL: ulto4f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; NODQ-NEXT: vzeroupper -; NODQ-NEXT: retq -; -; VLDQ-LABEL: ulto4f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 -; VLDQ-NEXT: vzeroupper -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: ulto4f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %b = uitofp <4 x i64> %a to <4 x float> - ret <4 x float> %b -} - -define <8 x double> @ulto8f64(<8 x i64> %a) { -; NODQ-LABEL: ulto8f64: -; NODQ: # %bb.0: -; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 -; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1 -; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 -; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; NODQ-NEXT: vaddpd %zmm0, %zmm1, %zmm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: ulto8f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: ulto8f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; DQNOVL-NEXT: retq - %b = uitofp <8 x i64> %a to <8 x double> - ret <8 x double> %b 
-} - -define <16 x double> @ulto16f64(<16 x i64> %a) { -; NODQ-LABEL: ulto16f64: -; NODQ: # %bb.0: -; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3 -; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 -; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0 -; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0 -; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0 -; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2 -; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2 -; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1 -; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1 -; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1 -; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: ulto16f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; VLDQ-NEXT: vcvtuqq2pd %zmm1, %zmm1 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: ulto16f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; DQNOVL-NEXT: vcvtuqq2pd %zmm1, %zmm1 -; DQNOVL-NEXT: retq - %b = uitofp <16 x i64> %a to <16 x double> - ret <16 x double> %b -} - -define <16 x i32> @f64to16si(<16 x float> %a) nounwind { -; ALL-LABEL: f64to16si: -; ALL: # %bb.0: -; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 -; ALL-NEXT: retq - %b = fptosi <16 x float> %a to <16 x i32> - ret <16 x i32> %b -} - -define <16 x i8> @f32to16sc(<16 x float> %f) { -; ALL-LABEL: f32to16sc: -; ALL: # %bb.0: -; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 -; ALL-NEXT: vpmovdb %zmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %res = fptosi <16 x float> %f to <16 x i8> - ret <16 x i8> %res -} - -define <16 x i16> @f32to16ss(<16 x float> %f) { -; ALL-LABEL: f32to16ss: -; ALL: # %bb.0: -; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 -; ALL-NEXT: vpmovdw %zmm0, %ymm0 -; ALL-NEXT: retq - %res = fptosi <16 x float> %f to <16 x i16> - ret <16 x i16> %res -} - -define <16 x i32> @f32to16ui(<16 x float> %a) nounwind { -; ALL-LABEL: f32to16ui: -; ALL: # %bb.0: -; ALL-NEXT: vcvttps2udq %zmm0, %zmm0 -; ALL-NEXT: retq - %b = fptoui <16 x float> %a to <16 x i32> - ret <16 x i32> %b -} - -define <16 x i8> @f32to16uc(<16 x float> %f) { -; ALL-LABEL: f32to16uc: -; ALL: # %bb.0: -; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 -; ALL-NEXT: vpmovdb %zmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %res = fptoui <16 x float> %f to <16 x i8> - ret <16 x i8> %res -} - -define <16 x i16> @f32to16us(<16 x float> %f) { -; ALL-LABEL: f32to16us: -; ALL: # %bb.0: -; ALL-NEXT: vcvttps2dq %zmm0, %zmm0 -; ALL-NEXT: vpmovdw %zmm0, %ymm0 -; ALL-NEXT: retq - %res = fptoui <16 x float> %f to <16 x i16> - ret <16 x i16> %res -} - -define <8 x i32> @f32to8ui(<8 x float> %a) nounwind { -; NOVL-LABEL: f32to8ui: -; NOVL: # %bb.0: -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; NOVL-NEXT: retq -; -; VL-LABEL: f32to8ui: -; VL: # %bb.0: 
-; VL-NEXT: vcvttps2udq %ymm0, %ymm0 -; VL-NEXT: retq - %b = fptoui <8 x float> %a to <8 x i32> - ret <8 x i32> %b -} - -define <4 x i32> @f32to4ui(<4 x float> %a) nounwind { -; NOVL-LABEL: f32to4ui: -; NOVL: # %bb.0: -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0 -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVL-NEXT: vzeroupper -; NOVL-NEXT: retq -; -; VL-LABEL: f32to4ui: -; VL: # %bb.0: -; VL-NEXT: vcvttps2udq %xmm0, %xmm0 -; VL-NEXT: retq - %b = fptoui <4 x float> %a to <4 x i32> - ret <4 x i32> %b -} - -define <8 x i32> @f64to8ui(<8 x double> %a) nounwind { -; ALL-LABEL: f64to8ui: -; ALL: # %bb.0: -; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; ALL-NEXT: retq - %b = fptoui <8 x double> %a to <8 x i32> - ret <8 x i32> %b -} - -define <8 x i16> @f64to8us(<8 x double> %f) { -; NOVL-LABEL: f64to8us: -; NOVL: # %bb.0: -; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVL-NEXT: vpmovdw %zmm0, %ymm0 -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; NOVL-NEXT: vzeroupper -; NOVL-NEXT: retq -; -; VL-LABEL: f64to8us: -; VL: # %bb.0: -; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VL-NEXT: vpmovdw %ymm0, %xmm0 -; VL-NEXT: vzeroupper -; VL-NEXT: retq - %res = fptoui <8 x double> %f to <8 x i16> - ret <8 x i16> %res -} - -define <8 x i8> @f64to8uc(<8 x double> %f) { -; NOVL-LABEL: f64to8uc: -; NOVL: # %bb.0: -; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVL-NEXT: vpmovdb %zmm0, %xmm0 -; NOVL-NEXT: vzeroupper -; NOVL-NEXT: retq -; -; VL-LABEL: f64to8uc: -; VL: # %bb.0: -; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VL-NEXT: vpmovdb %ymm0, %xmm0 -; VL-NEXT: vzeroupper -; VL-NEXT: retq - %res = fptoui <8 x double> %f to <8 x i8> - ret <8 x i8> %res -} - -define <4 x i32> @f64to4ui(<4 x double> %a) nounwind { -; NOVL-LABEL: f64to4ui: -; NOVL: # %bb.0: -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; NOVL-NEXT: vzeroupper -; NOVL-NEXT: retq -; -; VL-LABEL: f64to4ui: -; VL: # %bb.0: -; VL-NEXT: vcvttpd2udq %ymm0, %xmm0 -; VL-NEXT: vzeroupper -; VL-NEXT: retq - %b = fptoui <4 x double> %a to <4 x i32> - ret <4 x i32> %b -} - -define <8 x double> @sito8f64(<8 x i32> %a) { -; ALL-LABEL: sito8f64: -; ALL: # %bb.0: -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %b = sitofp <8 x i32> %a to <8 x double> - ret <8 x double> %b -} -define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { -; KNL-LABEL: i32to8f64_mask: -; KNL: # %bb.0: -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; KNL-NEXT: retq -; -; VLBW-LABEL: i32to8f64_mask: -; VLBW: # %bb.0: -; VLBW-NEXT: kmovd %edi, %k1 -; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; VLBW-NEXT: retq -; -; VLNOBW-LABEL: i32to8f64_mask: -; VLNOBW: # %bb.0: -; VLNOBW-NEXT: kmovw %edi, %k1 -; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; VLNOBW-NEXT: retq -; -; DQNOVL-LABEL: i32to8f64_mask: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: kmovw %edi, %k1 -; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; DQNOVL-NEXT: retq -; -; AVX512BW-LABEL: i32to8f64_mask: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1} -; AVX512BW-NEXT: retq - %1 = bitcast i8 %c to <8 x i1> - %2 = sitofp <8 x i32> %b to <8 x double> - %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a - ret <8 x double> %3 -} -define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { -; KNL-LABEL: sito8f64_maskz: -; KNL: # %bb.0: -; KNL-NEXT: kmovw %edi, %k1 -; 
KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; KNL-NEXT: retq -; -; VLBW-LABEL: sito8f64_maskz: -; VLBW: # %bb.0: -; VLBW-NEXT: kmovd %edi, %k1 -; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; VLBW-NEXT: retq -; -; VLNOBW-LABEL: sito8f64_maskz: -; VLNOBW: # %bb.0: -; VLNOBW-NEXT: kmovw %edi, %k1 -; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; VLNOBW-NEXT: retq -; -; DQNOVL-LABEL: sito8f64_maskz: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: kmovw %edi, %k1 -; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; DQNOVL-NEXT: retq -; -; AVX512BW-LABEL: sito8f64_maskz: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq - %1 = bitcast i8 %b to <8 x i1> - %2 = sitofp <8 x i32> %a to <8 x double> - %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer - ret <8 x double> %3 -} - -define <8 x i32> @f64to8si(<8 x double> %a) { -; ALL-LABEL: f64to8si: -; ALL: # %bb.0: -; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; ALL-NEXT: retq - %b = fptosi <8 x double> %a to <8 x i32> - ret <8 x i32> %b -} - -define <8 x i16> @f64to8ss(<8 x double> %f) { -; NOVL-LABEL: f64to8ss: -; NOVL: # %bb.0: -; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVL-NEXT: vpmovdw %zmm0, %ymm0 -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; NOVL-NEXT: vzeroupper -; NOVL-NEXT: retq -; -; VL-LABEL: f64to8ss: -; VL: # %bb.0: -; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VL-NEXT: vpmovdw %ymm0, %xmm0 -; VL-NEXT: vzeroupper -; VL-NEXT: retq - %res = fptosi <8 x double> %f to <8 x i16> - ret <8 x i16> %res -} - -define <8 x i8> @f64to8sc(<8 x double> %f) { -; NOVL-LABEL: f64to8sc: -; NOVL: # %bb.0: -; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVL-NEXT: vpmovdb %zmm0, %xmm0 -; NOVL-NEXT: vzeroupper -; NOVL-NEXT: retq -; -; VL-LABEL: f64to8sc: -; VL: # %bb.0: -; VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VL-NEXT: vpmovdb %ymm0, %xmm0 -; VL-NEXT: vzeroupper -; VL-NEXT: retq - %res = fptosi <8 x double> %f to <8 x i8> - ret <8 x i8> %res -} - -define <4 x i32> @f64to4si(<4 x double> %a) { -; ALL-LABEL: f64to4si: -; ALL: # %bb.0: -; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %b = fptosi <4 x double> %a to <4 x i32> - ret <4 x i32> %b -} - -define <16 x float> @f64to16f32(<16 x double> %b) nounwind { -; ALL-LABEL: f64to16f32: -; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0 -; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1 -; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; ALL-NEXT: retq - %a = fptrunc <16 x double> %b to <16 x float> - ret <16 x float> %a -} - -define <4 x float> @f64to4f32(<4 x double> %b) { -; ALL-LABEL: f64to4f32: -; ALL: # %bb.0: -; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %a = fptrunc <4 x double> %b to <4 x float> - ret <4 x float> %a -} - -define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) { -; NOVLDQ-LABEL: f64to4f32_mask: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vpslld $31, %xmm1, %xmm1 -; NOVLDQ-NEXT: vptestmd %zmm1, %zmm1, %k1 -; NOVLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0 -; NOVLDQ-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: f64to4f32_mask: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpslld $31, %xmm1, %xmm1 -; VLDQ-NEXT: vpmovd2m %xmm1, %k1 -; VLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} -; VLDQ-NEXT: vzeroupper -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: f64to4f32_mask: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpslld $31, %xmm1, %xmm1 -; VLNODQ-NEXT: vptestmd %xmm1, %xmm1, %k1 -; 
VLNODQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vzeroupper -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: f64to4f32_mask: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vpslld $31, %xmm1, %xmm1 -; DQNOVL-NEXT: vpmovd2m %zmm1, %k1 -; DQNOVL-NEXT: vcvtpd2ps %ymm0, %xmm0 -; DQNOVL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %a = fptrunc <4 x double> %b to <4 x float> - %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer - ret <4 x float> %c -} - -define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { -; ALL-LABEL: f64tof32_inreg: -; ALL: # %bb.0: -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0 -; ALL-NEXT: retq - %ext = extractelement <2 x double> %a0, i32 0 - %cvt = fptrunc double %ext to float - %res = insertelement <4 x float> %a1, float %cvt, i32 0 - ret <4 x float> %res -} - -define <8 x double> @f32to8f64(<8 x float> %b) nounwind { -; ALL-LABEL: f32to8f64: -; ALL: # %bb.0: -; ALL-NEXT: vcvtps2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %a = fpext <8 x float> %b to <8 x double> - ret <8 x double> %a -} - -define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) { -; NOVL-LABEL: f32to4f64_mask: -; NOVL: # %bb.0: -; NOVL-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 -; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0 -; NOVL-NEXT: vcmpltpd %zmm2, %zmm1, %k1 -; NOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; NOVL-NEXT: retq -; -; VL-LABEL: f32to4f64_mask: -; VL: # %bb.0: -; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1 -; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z} -; VL-NEXT: retq - %a = fpext <4 x float> %b to <4 x double> - %mask = fcmp ogt <4 x double> %a1, %b1 - %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer - ret <4 x double> %c -} - -define <4 x double> @f32to4f64_mask_load(<4 x float>* %p, <4 x double> %b1, <4 x double> %a1) { -; NOVL-LABEL: f32to4f64_mask_load: -; NOVL: # %bb.0: -; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NOVL-NEXT: vcvtps2pd (%rdi), %ymm2 -; NOVL-NEXT: vcmpltpd %zmm1, %zmm0, %k1 -; NOVL-NEXT: vmovapd %zmm2, %zmm0 {%k1} {z} -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; NOVL-NEXT: retq -; -; VL-LABEL: f32to4f64_mask_load: -; VL: # %bb.0: -; VL-NEXT: vcmpltpd %ymm1, %ymm0, %k1 -; VL-NEXT: vcvtps2pd (%rdi), %ymm0 {%k1} {z} -; VL-NEXT: retq - %b = load <4 x float>, <4 x float>* %p - %a = fpext <4 x float> %b to <4 x double> - %mask = fcmp ogt <4 x double> %a1, %b1 - %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer - ret <4 x double> %c -} - -define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind { -; ALL-LABEL: f32tof64_inreg: -; ALL: # %bb.0: -; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0 -; ALL-NEXT: retq - %ext = extractelement <4 x float> %a1, i32 0 - %cvt = fpext float %ext to double - %res = insertelement <2 x double> %a0, double %cvt, i32 0 - ret <2 x double> %res -} - -define double @sltof64_load(i64* nocapture %e) { -; ALL-LABEL: sltof64_load: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 -; ALL-NEXT: retq -entry: - %tmp1 = load i64, i64* %e, align 8 - %conv = sitofp i64 %tmp1 to double - ret double %conv -} - -define double @sitof64_load(i32* %e) { -; ALL-LABEL: sitof64_load: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, 
%xmm0 -; ALL-NEXT: retq -entry: - %tmp1 = load i32, i32* %e, align 4 - %conv = sitofp i32 %tmp1 to double - ret double %conv -} - -define float @sitof32_load(i32* %e) { -; ALL-LABEL: sitof32_load: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 -; ALL-NEXT: retq -entry: - %tmp1 = load i32, i32* %e, align 4 - %conv = sitofp i32 %tmp1 to float - ret float %conv -} - -define float @sltof32_load(i64* %e) { -; ALL-LABEL: sltof32_load: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 -; ALL-NEXT: retq -entry: - %tmp1 = load i64, i64* %e, align 8 - %conv = sitofp i64 %tmp1 to float - ret float %conv -} - -define void @f32tof64_loadstore() { -; ALL-LABEL: f32tof64_loadstore: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: retq -entry: - %f = alloca float, align 4 - %d = alloca double, align 8 - %tmp = load float, float* %f, align 4 - %conv = fpext float %tmp to double - store double %conv, double* %d, align 8 - ret void -} - -define void @f64tof32_loadstore() nounwind uwtable { -; ALL-LABEL: f64tof32_loadstore: -; ALL: # %bb.0: # %entry -; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) -; ALL-NEXT: retq -entry: - %f = alloca float, align 4 - %d = alloca double, align 8 - %tmp = load double, double* %d, align 8 - %conv = fptrunc double %tmp to float - store float %conv, float* %f, align 4 - ret void -} - -define double @long_to_double(i64 %x) { -; ALL-LABEL: long_to_double: -; ALL: # %bb.0: -; ALL-NEXT: vmovq %rdi, %xmm0 -; ALL-NEXT: retq - %res = bitcast i64 %x to double - ret double %res -} - -define i64 @double_to_long(double %x) { -; ALL-LABEL: double_to_long: -; ALL: # %bb.0: -; ALL-NEXT: vmovq %xmm0, %rax -; ALL-NEXT: retq - %res = bitcast double %x to i64 - ret i64 %res -} - -define float @int_to_float(i32 %x) { -; ALL-LABEL: int_to_float: -; ALL: # %bb.0: -; ALL-NEXT: vmovd %edi, %xmm0 -; ALL-NEXT: retq - %res = bitcast i32 %x to float - ret float %res -} - -define i32 @float_to_int(float %x) { -; ALL-LABEL: float_to_int: -; ALL: # %bb.0: -; ALL-NEXT: vmovd %xmm0, %eax -; ALL-NEXT: retq - %res = bitcast float %x to i32 - ret i32 %res -} - -define <16 x double> @uito16f64(<16 x i32> %a) nounwind { -; ALL-LABEL: uito16f64: -; ALL: # %bb.0: -; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1 -; ALL-NEXT: vmovaps %zmm2, %zmm0 -; ALL-NEXT: retq - %b = uitofp <16 x i32> %a to <16 x double> - ret <16 x double> %b -} - -define <8 x float> @slto8f32(<8 x i64> %a) { -; NODQ-LABEL: slto8f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps 
{{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: slto8f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: slto8f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 -; DQNOVL-NEXT: retq - %b = sitofp <8 x i64> %a to <8 x float> - ret <8 x float> %b -} - -define <16 x float> @slto16f32(<16 x i64> %a) { -; NODQ-LABEL: slto16f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: slto16f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0 -; VLDQ-NEXT: vcvtqq2ps %zmm1, %ymm1 -; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: slto16f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0 -; 
DQNOVL-NEXT: vcvtqq2ps %zmm1, %ymm1 -; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; DQNOVL-NEXT: retq - %b = sitofp <16 x i64> %a to <16 x float> - ret <16 x float> %b -} - -define <8 x double> @slto8f64(<8 x i64> %a) { -; NODQ-LABEL: slto8f64: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: slto8f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: slto8f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 -; DQNOVL-NEXT: retq - %b = sitofp <8 x i64> %a to <8 x double> - ret <8 x double> %b -} - -define <16 x double> @slto16f64(<16 x i64> %a) { -; NODQ-LABEL: slto16f64: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 -; NODQ-NEXT: vpextrq 
$1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 -; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 -; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: slto16f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0 -; VLDQ-NEXT: vcvtqq2pd %zmm1, %zmm1 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: slto16f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0 -; DQNOVL-NEXT: vcvtqq2pd %zmm1, %zmm1 -; DQNOVL-NEXT: retq - %b = sitofp <16 x i64> %a to <16 x double> - ret <16 x double> %b -} - -define <8 x float> @ulto8f32(<8 x i64> %a) { -; NODQ-LABEL: ulto8f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: ulto8f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: ulto8f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; DQNOVL-NEXT: retq - %b = uitofp <8 x i64> %a to <8 x float> - ret <8 x float> %b -} - -define <16 x float> @ulto16f32(<16 x i64> %a) { -; NODQ-LABEL: ulto16f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 -; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = 
xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 -; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 -; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] -; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3 -; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] -; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 -; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] -; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 -; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] -; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: ulto16f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; VLDQ-NEXT: vcvtuqq2ps %zmm1, %ymm1 -; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: ulto16f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; DQNOVL-NEXT: vcvtuqq2ps %zmm1, %ymm1 -; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 -; DQNOVL-NEXT: retq - %b = uitofp <16 x i64> %a to <16 x float> - ret <16 x float> %b -} - -define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind { -; KNL-LABEL: uito8f64_mask: -; KNL: # %bb.0: -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; KNL-NEXT: retq -; -; VLBW-LABEL: uito8f64_mask: -; VLBW: # %bb.0: -; VLBW-NEXT: kmovd %edi, %k1 -; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; VLBW-NEXT: retq -; -; VLNOBW-LABEL: uito8f64_mask: -; VLNOBW: # %bb.0: -; VLNOBW-NEXT: kmovw %edi, %k1 -; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; VLNOBW-NEXT: retq -; -; DQNOVL-LABEL: uito8f64_mask: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: kmovw %edi, %k1 -; DQNOVL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; DQNOVL-NEXT: retq -; -; AVX512BW-LABEL: uito8f64_mask: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1} -; AVX512BW-NEXT: retq - %1 = bitcast i8 %c to <8 x i1> - %2 = uitofp <8 x i32> %b to <8 x double> - %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a - ret <8 x double> %3 -} -define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind { -; KNL-LABEL: uito8f64_maskz: -; KNL: # %bb.0: -; KNL-NEXT: kmovw %edi, %k1 -; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; KNL-NEXT: retq -; -; VLBW-LABEL: uito8f64_maskz: -; VLBW: # %bb.0: -; VLBW-NEXT: kmovd %edi, %k1 -; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; VLBW-NEXT: retq -; -; VLNOBW-LABEL: uito8f64_maskz: -; VLNOBW: # %bb.0: -; VLNOBW-NEXT: kmovw %edi, %k1 -; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; VLNOBW-NEXT: retq -; -; DQNOVL-LABEL: uito8f64_maskz: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: kmovw 
%edi, %k1 -; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; DQNOVL-NEXT: retq -; -; AVX512BW-LABEL: uito8f64_maskz: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq - %1 = bitcast i8 %b to <8 x i1> - %2 = uitofp <8 x i32> %a to <8 x double> - %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer - ret <8 x double> %3 -} - -define <4 x double> @uito4f64(<4 x i32> %a) nounwind { -; NOVL-LABEL: uito4f64: -; NOVL: # %bb.0: -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; NOVL-NEXT: retq -; -; VL-LABEL: uito4f64: -; VL: # %bb.0: -; VL-NEXT: vcvtudq2pd %xmm0, %ymm0 -; VL-NEXT: retq - %b = uitofp <4 x i32> %a to <4 x double> - ret <4 x double> %b -} - -define <16 x float> @uito16f32(<16 x i32> %a) nounwind { -; ALL-LABEL: uito16f32: -; ALL: # %bb.0: -; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; ALL-NEXT: retq - %b = uitofp <16 x i32> %a to <16 x float> - ret <16 x float> %b -} - -define <8 x double> @uito8f64(<8 x i32> %a) { -; ALL-LABEL: uito8f64: -; ALL: # %bb.0: -; ALL-NEXT: vcvtudq2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %b = uitofp <8 x i32> %a to <8 x double> - ret <8 x double> %b -} - -define <8 x float> @uito8f32(<8 x i32> %a) nounwind { -; NOVL-LABEL: uito8f32: -; NOVL: # %bb.0: -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; NOVL-NEXT: retq -; -; VL-LABEL: uito8f32: -; VL: # %bb.0: -; VL-NEXT: vcvtudq2ps %ymm0, %ymm0 -; VL-NEXT: retq - %b = uitofp <8 x i32> %a to <8 x float> - ret <8 x float> %b -} - -define <4 x float> @uito4f32(<4 x i32> %a) nounwind { -; NOVL-LABEL: uito4f32: -; NOVL: # %bb.0: -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0 -; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVL-NEXT: vzeroupper -; NOVL-NEXT: retq -; -; VL-LABEL: uito4f32: -; VL: # %bb.0: -; VL-NEXT: vcvtudq2ps %xmm0, %xmm0 -; VL-NEXT: retq - %b = uitofp <4 x i32> %a to <4 x float> - ret <4 x float> %b -} - -define i32 @fptosi(float %a) nounwind { -; ALL-LABEL: fptosi: -; ALL: # %bb.0: -; ALL-NEXT: vcvttss2si %xmm0, %eax -; ALL-NEXT: retq - %b = fptosi float %a to i32 - ret i32 %b -} - -define i32 @fptoui(float %a) nounwind { -; ALL-LABEL: fptoui: -; ALL: # %bb.0: -; ALL-NEXT: vcvttss2usi %xmm0, %eax -; ALL-NEXT: retq - %b = fptoui float %a to i32 - ret i32 %b -} - -define float @uitof32(i32 %a) nounwind { -; ALL-LABEL: uitof32: -; ALL: # %bb.0: -; ALL-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 -; ALL-NEXT: retq - %b = uitofp i32 %a to float - ret float %b -} - -define double @uitof64(i32 %a) nounwind { -; ALL-LABEL: uitof64: -; ALL: # %bb.0: -; ALL-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 -; ALL-NEXT: retq - %b = uitofp i32 %a to double - ret double %b -} - -define <16 x float> @sbto16f32(<16 x i32> %a) { -; NODQ-LABEL: sbto16f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: sbto16f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovd2m %zmm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %zmm0 -; VLDQ-NEXT: vcvtdq2ps %zmm0, %zmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: sbto16f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vcvtdq2ps 
%zmm0, %zmm0 -; DQNOVL-NEXT: retq - %mask = icmp slt <16 x i32> %a, zeroinitializer - %1 = sitofp <16 x i1> %mask to <16 x float> - ret <16 x float> %1 -} - -define <16 x float> @scto16f32(<16 x i8> %a) { -; ALL-LABEL: scto16f32: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbd %xmm0, %zmm0 -; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; ALL-NEXT: retq - %1 = sitofp <16 x i8> %a to <16 x float> - ret <16 x float> %1 -} - -define <16 x float> @ssto16f32(<16 x i16> %a) { -; ALL-LABEL: ssto16f32: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 -; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; ALL-NEXT: retq - %1 = sitofp <16 x i16> %a to <16 x float> - ret <16 x float> %1 -} - -define <8 x double> @ssto16f64(<8 x i16> %a) { -; ALL-LABEL: ssto16f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %1 = sitofp <8 x i16> %a to <8 x double> - ret <8 x double> %1 -} - -define <8 x double> @scto8f64(<8 x i8> %a) { -; ALL-LABEL: scto8f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbd %xmm0, %ymm0 -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %1 = sitofp <8 x i8> %a to <8 x double> - ret <8 x double> %1 -} - -define <16 x double> @scto16f64(<16 x i8> %a) { -; ALL-LABEL: scto16f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxbd %xmm0, %zmm1 -; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 -; ALL-NEXT: retq - %b = sitofp <16 x i8> %a to <16 x double> - ret <16 x double> %b -} - -define <16 x double> @sbto16f64(<16 x double> %a) { -; NODQ-LABEL: sbto16f64: -; NODQ: # %bb.0: -; NODQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; NODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 -; NODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; NODQ-NEXT: kunpckbw %k0, %k1, %k1 -; NODQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} -; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 -; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: sbto16f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0 -; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; VLDQ-NEXT: kunpckbw %k0, %k1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %zmm1 -; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 -; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: sbto16f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm2, %k0 -; DQNOVL-NEXT: vcmpltpd %zmm1, %zmm2, %k1 -; DQNOVL-NEXT: kunpckbw %k0, %k1, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm1 -; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 -; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 -; DQNOVL-NEXT: retq - %cmpres = fcmp ogt <16 x double> %a, zeroinitializer - %1 = sitofp <16 x i1> %cmpres to <16 x double> - ret <16 x double> %1 -} - -define <8 x double> @sbto8f64(<8 x double> %a) { -; NOVLDQ-LABEL: sbto8f64: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: sbto8f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %ymm0 -; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: sbto8f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, 
%ymm0 -; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0 -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: sbto8f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm1, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; DQNOVL-NEXT: retq - %cmpres = fcmp ogt <8 x double> %a, zeroinitializer - %1 = sitofp <8 x i1> %cmpres to <8 x double> - ret <8 x double> %1 -} - -define <8 x float> @sbto8f32(<8 x float> %a) { -; ALL-LABEL: sbto8f32: -; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcmpltps %ymm0, %ymm1, %ymm0 -; ALL-NEXT: vcvtdq2ps %ymm0, %ymm0 -; ALL-NEXT: retq - %cmpres = fcmp ogt <8 x float> %a, zeroinitializer - %1 = sitofp <8 x i1> %cmpres to <8 x float> - ret <8 x float> %1 -} - -define <4 x float> @sbto4f32(<4 x float> %a) { -; ALL-LABEL: sbto4f32: -; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; ALL-NEXT: retq - %cmpres = fcmp ogt <4 x float> %a, zeroinitializer - %1 = sitofp <4 x i1> %cmpres to <4 x float> - ret <4 x float> %1 -} - -define <4 x double> @sbto4f64(<4 x double> %a) { -; NOVL-LABEL: sbto4f64: -; NOVL: # %bb.0: -; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0 -; NOVL-NEXT: vpmovqd %zmm0, %ymm0 -; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; NOVL-NEXT: retq -; -; VLDQ-LABEL: sbto4f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: sbto4f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VLNODQ-NEXT: retq - %cmpres = fcmp ogt <4 x double> %a, zeroinitializer - %1 = sitofp <4 x i1> %cmpres to <4 x double> - ret <4 x double> %1 -} - -define <2 x float> @sbto2f32(<2 x float> %a) { -; ALL-LABEL: sbto2f32: -; ALL: # %bb.0: -; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0 -; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0 -; ALL-NEXT: retq - %cmpres = fcmp ogt <2 x float> %a, zeroinitializer - %1 = sitofp <2 x i1> %cmpres to <2 x float> - ret <2 x float> %1 -} - -define <2 x double> @sbto2f64(<2 x double> %a) { -; NOVL-LABEL: sbto2f64: -; NOVL: # %bb.0: -; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; NOVL-NEXT: retq -; -; VLDQ-LABEL: sbto2f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %xmm0 -; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: sbto2f64: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} -; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VLNODQ-NEXT: retq - %cmpres = fcmp ogt <2 x double> %a, zeroinitializer - %1 = sitofp <2 x i1> %cmpres to <2 x double> - ret <2 x double> %1 -} - -define <16 x float> @ucto16f32(<16 x i8> %a) { -; ALL-LABEL: ucto16f32: -; ALL: # %bb.0: -; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; ALL-NEXT: retq - %b = uitofp <16 x i8> %a to <16 x float> - ret <16 x float>%b -} - -define <8 x double> @ucto8f64(<8 x i8> %a) { -; ALL-LABEL: ucto8f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %b = uitofp <8 x i8> %a to <8 x double> - ret <8 x double> %b -} - -define <16 x float> @swto16f32(<16 x i16> %a) { -; ALL-LABEL: swto16f32: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxwd %ymm0, %zmm0 -; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; ALL-NEXT: retq - %b = sitofp <16 x i16> %a to <16 x float> - ret <16 x float> %b -} - -define <8 x double> @swto8f64(<8 x i16> %a) { -; ALL-LABEL: swto8f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxwd %xmm0, %ymm0 -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %b = sitofp <8 x i16> %a to <8 x double> - ret <8 x double> %b -} - -define <16 x double> @swto16f64(<16 x i16> %a) { -; ALL-LABEL: swto16f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovsxwd %ymm0, %zmm1 -; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 -; ALL-NEXT: retq - %b = sitofp <16 x i16> %a to <16 x double> - ret <16 x double> %b -} - -define <16 x double> @ucto16f64(<16 x i8> %a) { -; ALL-LABEL: ucto16f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 -; ALL-NEXT: retq - %b = uitofp <16 x i8> %a to <16 x double> - ret <16 x double> %b -} - -define <16 x float> @uwto16f32(<16 x i16> %a) { -; ALL-LABEL: uwto16f32: -; ALL: # %bb.0: -; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; ALL-NEXT: retq - %b = uitofp <16 x i16> %a to <16 x float> - ret <16 x float> %b -} - -define <8 x double> @uwto8f64(<8 x i16> %a) { -; ALL-LABEL: uwto8f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %b = uitofp <8 x i16> %a to <8 x double> - ret <8 x double> %b -} - -define <16 x double> @uwto16f64(<16 x i16> %a) { -; ALL-LABEL: uwto16f64: -; ALL: # %bb.0: -; ALL-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0 -; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1 -; ALL-NEXT: retq - %b = uitofp <16 x i16> %a to <16 x double> - ret <16 x double> %b -} - -define <16 x float> @sito16f32(<16 x i32> %a) { -; ALL-LABEL: sito16f32: -; ALL: # %bb.0: -; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; ALL-NEXT: retq - %b = sitofp <16 x i32> %a to <16 x float> - ret <16 x float> %b -} - -define <16 x double> @sito16f64(<16 x i32> %a) { -; ALL-LABEL: sito16f64: -; ALL: # %bb.0: -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm2 -; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0 -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1 -; ALL-NEXT: vmovaps %zmm2, %zmm0 -; ALL-NEXT: retq - %b = sitofp <16 x i32> %a to <16 x double> - ret <16 x double> %b -} - -define <16 x float> @usto16f32(<16 x i16> %a) { -; ALL-LABEL: usto16f32: -; ALL: # %bb.0: -; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; ALL-NEXT: retq - %b = uitofp <16 x i16> %a to <16 x float> - ret <16 x float> %b -} - -define <16 x float> @ubto16f32(<16 x i32> %a) { -; NODQ-LABEL: ubto16f32: -; NODQ: # %bb.0: -; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NODQ-NEXT: vpsrld $31, %zmm0, %zmm0 -; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: ubto16f32: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovd2m %zmm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %zmm0 -; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm0 -; VLDQ-NEXT: vcvtdq2ps %zmm0, %zmm0 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: ubto16f32: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm0 -; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0 -; DQNOVL-NEXT: retq - %mask = icmp slt <16 x i32> %a, zeroinitializer - %1 = uitofp <16 x i1> %mask to <16 x float> - ret <16 x float> %1 -} - -define <16 x double> @ubto16f64(<16 x i32> %a) { -; NODQ-LABEL: ubto16f64: -; NODQ: # %bb.0: -; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1 -; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; NODQ-NEXT: vpsrld $31, %zmm0, %zmm1 -; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0 -; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; NODQ-NEXT: retq -; -; VLDQ-LABEL: ubto16f64: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovd2m %zmm0, %k0 -; VLDQ-NEXT: vpmovm2d %k0, %zmm0 -; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm1 -; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0 -; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1 -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: ubto16f64: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vpmovd2m %zmm0, %k0 -; DQNOVL-NEXT: vpmovm2d %k0, %zmm0 -; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm1 -; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 -; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1 -; DQNOVL-NEXT: retq - %mask = icmp slt <16 x i32> %a, zeroinitializer - %1 = uitofp <16 x i1> %mask to <16 x double> - ret <16 x double> %1 -} - -define <8 x float> @ubto8f32(<8 x i32> %a) { -; NOVL-LABEL: ubto8f32: -; NOVL: # %bb.0: 
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 -; NOVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216] -; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; NOVL-NEXT: retq -; -; VL-LABEL: ubto8f32: -; VL: # %bb.0: -; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0 -; VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; VL-NEXT: retq - %mask = icmp slt <8 x i32> %a, zeroinitializer - %1 = uitofp <8 x i1> %mask to <8 x float> - ret <8 x float> %1 -} - -define <8 x double> @ubto8f64(<8 x i32> %a) { -; ALL-LABEL: ubto8f64: -; ALL: # %bb.0: -; ALL-NEXT: vpsrld $31, %ymm0, %ymm0 -; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0 -; ALL-NEXT: retq - %mask = icmp slt <8 x i32> %a, zeroinitializer - %1 = uitofp <8 x i1> %mask to <8 x double> - ret <8 x double> %1 -} - -define <4 x float> @ubto4f32(<4 x i32> %a) { -; NOVL-LABEL: ubto4f32: -; NOVL: # %bb.0: -; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] -; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0 -; NOVL-NEXT: retq -; -; VL-LABEL: ubto4f32: -; VL: # %bb.0: -; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0 -; VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; VL-NEXT: retq - %mask = icmp slt <4 x i32> %a, zeroinitializer - %1 = uitofp <4 x i1> %mask to <4 x float> - ret <4 x float> %1 -} - -define <4 x double> @ubto4f64(<4 x i32> %a) { -; ALL-LABEL: ubto4f64: -; ALL: # %bb.0: -; ALL-NEXT: vpsrld $31, %xmm0, %xmm0 -; ALL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; ALL-NEXT: retq - %mask = icmp slt <4 x i32> %a, zeroinitializer - %1 = uitofp <4 x i1> %mask to <4 x double> - ret <4 x double> %1 -} - -define <2 x float> @ubto2f32(<2 x i32> %a) { -; NOVL-LABEL: ubto2f32: -; NOVL: # %bb.0: -; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216] -; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; NOVL-NEXT: retq -; -; VL-LABEL: ubto2f32: -; VL: # %bb.0: -; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; VL-NEXT: retq - %mask = icmp ne <2 x i32> %a, zeroinitializer - %1 = uitofp <2 x i1> %mask to <2 x float> - ret <2 x float> %1 -} - -define <2 x double> @ubto2f64(<2 x i32> %a) { -; NOVL-LABEL: ubto2f64: -; NOVL: # %bb.0: -; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1] -; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0 -; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; NOVL-NEXT: retq -; -; VL-LABEL: ubto2f64: -; VL: # %bb.0: -; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 -; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0 -; VL-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VL-NEXT: retq - %mask = icmp ne <2 x i32> %a, zeroinitializer - %1 = uitofp <2 x i1> %mask to <2 x double> - ret <2 x double> %1 -} - -define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) { -; NOVLDQ-LABEL: test_2f64toub: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vcvttpd2udq %zmm0, %ymm0 -; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def 
$xmm0 killed $xmm0 killed $zmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_2f64toub: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 -; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vpmovd2m %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_2f64toub: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttpd2udq %xmm0, %xmm0 -; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_2f64toub: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vcvttpd2udq %zmm0, %ymm0 -; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %mask = fptoui <2 x double> %a to <2 x i1> - %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer - ret <2 x i64> %select -} - -define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) { -; NOVLDQ-LABEL: test_4f64toub: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_4f64toub: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vpmovd2m %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_4f64toub: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_4f64toub: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 -; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; DQNOVL-NEXT: retq - %mask = fptoui <4 x double> %a to <4 x i1> - %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer - ret <4 x i64> %select -} - -define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) { -; NOVLDQ-LABEL: test_8f64toub: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_8f64toub: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0 -; VLDQ-NEXT: vpmovd2m %ymm0, %k1 -; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_8f64toub: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 -; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_8f64toub: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: retq - %mask = fptoui <8 x double> %a to <8 x i1> - %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer - ret <8 x i64> %select -} - -define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) { -; NOVLDQ-LABEL: test_2f32toub: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_2f32toub: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vpmovd2m %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_2f32toub: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_2f32toub: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 -; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %mask = fptoui <2 x float> %a to <2 x i1> - %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer - ret <2 x i64> %select -} - -define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) { -; NOVLDQ-LABEL: test_4f32toub: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_4f32toub: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vpmovd2m %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_4f32toub: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_4f32toub: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 -; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; DQNOVL-NEXT: retq - %mask = fptoui <4 x float> %a to <4 x i1> - %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer - ret <4 x i64> %select -} - -define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) { -; NOVLDQ-LABEL: test_8f32toub: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_8f32toub: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; VLDQ-NEXT: 
vpslld $31, %ymm0, %ymm0 -; VLDQ-NEXT: vpmovd2m %ymm0, %k1 -; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_8f32toub: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0 -; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_8f32toub: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0 -; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: retq - %mask = fptoui <8 x float> %a to <8 x i1> - %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer - ret <8 x i64> %select -} - -define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) { -; NODQ-LABEL: test_16f32toub: -; NODQ: # %bb.0: -; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0 -; NODQ-NEXT: vpslld $31, %zmm0, %zmm0 -; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} -; NODQ-NEXT: retq -; -; VLDQ-LABEL: test_16f32toub: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0 -; VLDQ-NEXT: vpslld $31, %zmm0, %zmm0 -; VLDQ-NEXT: vpmovd2m %zmm0, %k1 -; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: test_16f32toub: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0 -; DQNOVL-NEXT: vpslld $31, %zmm0, %zmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: retq - %mask = fptoui <16 x float> %a to <16 x i1> - %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer - ret <16 x i32> %select -} - -define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) { -; NOVLDQ-LABEL: test_2f64tosb: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; NOVLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_2f64tosb: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLDQ-NEXT: vpmovd2m %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_2f64tosb: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttpd2dq %xmm0, %xmm0 -; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0 -; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_2f64tosb: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; DQNOVL-NEXT: vcvttpd2dq %xmm0, %xmm0 -; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %mask = fptosi <2 x double> %a to <2 x i1> - %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer - ret <2 x i64> %select -} - -define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) { -; NOVLDQ-LABEL: test_4f64tosb: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $ymm0 killed 
$ymm0 killed $zmm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_4f64tosb: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VLDQ-NEXT: vpmovd2m %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_4f64tosb: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_4f64tosb: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; DQNOVL-NEXT: retq - %mask = fptosi <4 x double> %a to <4 x i1> - %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer - ret <4 x i64> %select -} - -define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) { -; NOVLDQ-LABEL: test_8f64tosb: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_8f64tosb: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VLDQ-NEXT: vpmovd2m %ymm0, %k1 -; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_8f64tosb: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_8f64tosb: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: retq - %mask = fptosi <8 x double> %a to <8 x i1> - %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer - ret <8 x i64> %select -} - -define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) { -; NOVLDQ-LABEL: test_2f32tosb: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_2f32tosb: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; VLDQ-NEXT: vpmovd2m %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_2f32tosb: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_2f32tosb: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %mask = fptosi <2 x float> %a to <2 x i1> - %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer - ret <2 x i64> %select -} - -define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) { -; NOVLDQ-LABEL: test_4f32tosb: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 
%zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_4f32tosb: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; VLDQ-NEXT: vpmovd2m %xmm0, %k1 -; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_4f32tosb: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0 -; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_4f32tosb: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; DQNOVL-NEXT: retq - %mask = fptosi <4 x float> %a to <4 x i1> - %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer - ret <4 x i64> %select -} - -define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) { -; NOVLDQ-LABEL: test_8f32tosb: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_8f32tosb: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; VLDQ-NEXT: vpmovd2m %ymm0, %k1 -; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_8f32tosb: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0 -; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1 -; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_8f32tosb: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: retq - %mask = fptosi <8 x float> %a to <8 x i1> - %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer - ret <8 x i64> %select -} - -define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) { -; NODQ-LABEL: test_16f32tosb: -; NODQ: # %bb.0: -; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0 -; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1 -; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} -; NODQ-NEXT: retq -; -; VLDQ-LABEL: test_16f32tosb: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0 -; VLDQ-NEXT: vpmovd2m %zmm0, %k1 -; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; DQNOVL-LABEL: test_16f32tosb: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0 -; DQNOVL-NEXT: vpmovd2m %zmm0, %k1 -; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z} -; DQNOVL-NEXT: retq - %mask = fptosi <16 x float> %a to <16 x i1> - %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer - ret <16 x i32> %select -} - -define <2 x double> @test_sito2f64_mask_load(<2 x i32> *%a, <2 x i64> %c) { -; SSE-LABEL: sitofp_load_2i32_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_load_2i32_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 -; AVX-NEXT: retq -; NOVLDQ-LABEL: test_sito2f64_mask_load: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vcvtdq2pd (%rdi), %xmm0 -; NOVLDQ-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: 
test_sito2f64_mask_load: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovq2m %xmm0, %k1 -; VLDQ-NEXT: vcvtdq2pd (%rdi), %xmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_sito2f64_mask_load: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vcvtdq2pd (%rdi), %xmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_sito2f64_mask_load: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vpmovq2m %zmm0, %k1 -; DQNOVL-NEXT: vcvtdq2pd (%rdi), %xmm0 -; DQNOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %mask = icmp slt <2 x i64> %c, zeroinitializer - %ld = load <2 x i32>, <2 x i32> *%a - %cvt = sitofp <2 x i32> %ld to <2 x double> - %sel = select <2 x i1> %mask, <2 x double> %cvt, <2 x double> zeroinitializer - ret <2 x double> %sel -} - -define <2 x double> @test_uito2f64_mask_load(<2 x i32> *%a, <2 x i64> %c) { -; SSE-LABEL: sitofp_load_2i32_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_load_2i32_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 -; AVX-NEXT: retq -; NOVLDQ-LABEL: test_uito2f64_mask_load: -; NOVLDQ: # %bb.0: -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1 -; NOVLDQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; NOVLDQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; NOVLDQ-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} -; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; NOVLDQ-NEXT: vzeroupper -; NOVLDQ-NEXT: retq -; -; VLDQ-LABEL: test_uito2f64_mask_load: -; VLDQ: # %bb.0: -; VLDQ-NEXT: vpmovq2m %xmm0, %k1 -; VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0 {%k1} {z} -; VLDQ-NEXT: retq -; -; VLNODQ-LABEL: test_uito2f64_mask_load: -; VLNODQ: # %bb.0: -; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VLNODQ-NEXT: vpcmpgtq %xmm0, %xmm1, %k1 -; VLNODQ-NEXT: vcvtudq2pd (%rdi), %xmm0 {%k1} {z} -; VLNODQ-NEXT: retq -; -; DQNOVL-LABEL: test_uito2f64_mask_load: -; DQNOVL: # %bb.0: -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; DQNOVL-NEXT: vpmovq2m %zmm0, %k1 -; DQNOVL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 -; DQNOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z} -; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; DQNOVL-NEXT: vzeroupper -; DQNOVL-NEXT: retq - %mask = icmp slt <2 x i64> %c, zeroinitializer - %ld = load <2 x i32>, <2 x i32> *%a - %cvt = uitofp <2 x i32> %ld to <2 x double> - %sel = select <2 x i1> %mask, <2 x double> %cvt, <2 x double> zeroinitializer - ret <2 x double> %sel -} diff --git a/test/CodeGen/X86/avx512-trunc-widen.ll b/test/CodeGen/X86/avx512-trunc-widen.ll deleted file mode 100644 index 1ce08c01773..00000000000 --- a/test/CodeGen/X86/avx512-trunc-widen.ll +++ /dev/null @@ -1,1035 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX - - attributes #0 = { nounwind } - -define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 { -; ALL-LABEL: trunc_16x32_to_16x8: -; ALL: ## 
%bb.0: -; ALL-NEXT: vpmovdb %zmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <16 x i32> %i to <16 x i8> - ret <16 x i8> %x -} - -define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 { -; ALL-LABEL: trunc_8x64_to_8x16: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovqw %zmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <8 x i64> %i to <8 x i16> - ret <8 x i16> %x -} - -define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 { -; ALL-LABEL: trunc_v16i32_to_v16i16: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovdw %zmm0, %ymm0 -; ALL-NEXT: retq - %1 = trunc <16 x i32> %x to <16 x i16> - ret <16 x i16> %1 -} - -define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 { -; ALL-LABEL: trunc_qb_512: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovqb %zmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <8 x i64> %i to <8 x i8> - ret <8 x i8> %x -} - -define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 { -; ALL-LABEL: trunc_qb_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovqb %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <8 x i64> %i to <8 x i8> - store <8 x i8> %x, <8 x i8>* %res - ret void -} - -define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 { -; KNL-LABEL: trunc_qb_256: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqb %zmm0, %xmm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qb_256: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovqb %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <4 x i64> %i to <4 x i8> - ret <4 x i8> %x -} - -define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 { -; KNL-LABEL: trunc_qb_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqb %zmm0, %xmm0 -; KNL-NEXT: vmovd %xmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qb_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovqb %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <4 x i64> %i to <4 x i8> - store <4 x i8> %x, <4 x i8>* %res - ret void -} - -define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 { -; ALL-LABEL: trunc_qb_128: -; ALL: ## %bb.0: -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; ALL-NEXT: retq - %x = trunc <2 x i64> %i to <2 x i8> - ret <2 x i8> %x -} - -define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 { -; KNL-LABEL: trunc_qb_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; KNL-NEXT: vpextrw $0, %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qb_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovqb %xmm0, (%rdi) -; SKX-NEXT: retq - %x = trunc <2 x i64> %i to <2 x i8> - store <2 x i8> %x, <2 x i8>* %res - ret void -} - -define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 { -; ALL-LABEL: trunc_qw_512: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovqw %zmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <8 x i64> %i to <8 x i16> - ret <8 x i16> %x -} - -define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 { -; ALL-LABEL: trunc_qw_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovqw %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <8 x i64> %i to <8 x i16> - store <8 x i16> %x, <8 x i16>* %res - ret void -} - -define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 { -; KNL-LABEL: trunc_qw_256: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqw %zmm0, %xmm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qw_256: -; SKX: ## %bb.0: 
-; SKX-NEXT: vpmovqw %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <4 x i64> %i to <4 x i16> - ret <4 x i16> %x -} - -define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 { -; KNL-LABEL: trunc_qw_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqw %zmm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qw_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovqw %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <4 x i64> %i to <4 x i16> - store <4 x i16> %x, <4 x i16>* %res - ret void -} - -define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 { -; KNL-LABEL: trunc_qw_128: -; KNL: ## %bb.0: -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qw_128: -; SKX: ## %bb.0: -; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; SKX-NEXT: retq - %x = trunc <2 x i64> %i to <2 x i16> - ret <2 x i16> %x -} - -define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 { -; KNL-LABEL: trunc_qw_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; KNL-NEXT: vmovd %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qw_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovqw %xmm0, (%rdi) -; SKX-NEXT: retq - %x = trunc <2 x i64> %i to <2 x i16> - store <2 x i16> %x, <2 x i16>* %res - ret void -} - -define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 { -; ALL-LABEL: trunc_qd_512: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovqd %zmm0, %ymm0 -; ALL-NEXT: retq - %x = trunc <8 x i64> %i to <8 x i32> - ret <8 x i32> %x -} - -define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 { -; ALL-LABEL: trunc_qd_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovqd %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <8 x i64> %i to <8 x i32> - store <8 x i32> %x, <8 x i32>* %res - ret void -} - -define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 { -; KNL-LABEL: trunc_qd_256: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qd_256: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovqd %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <4 x i64> %i to <4 x i32> - ret <4 x i32> %x -} - -define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 { -; KNL-LABEL: trunc_qd_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovqd %zmm0, %ymm0 -; KNL-NEXT: vmovdqa %xmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qd_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovqd %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <4 x i64> %i to <4 x i32> - store <4 x i32> %x, <4 x i32>* %res - ret void -} - -define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 { -; ALL-LABEL: trunc_qd_128: -; ALL: ## %bb.0: -; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; ALL-NEXT: retq - %x = trunc <2 x i64> %i to <2 x i32> - ret <2 x i32> %x -} - -define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 { -; KNL-LABEL: trunc_qd_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3] -; KNL-NEXT: vmovlps %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_qd_128_mem: -; SKX: ## %bb.0: -; 
SKX-NEXT: vpmovqd %xmm0, (%rdi) -; SKX-NEXT: retq - %x = trunc <2 x i64> %i to <2 x i32> - store <2 x i32> %x, <2 x i32>* %res - ret void -} - -define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 { -; ALL-LABEL: trunc_db_512: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovdb %zmm0, %xmm0 -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <16 x i32> %i to <16 x i8> - ret <16 x i8> %x -} - -define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 { -; ALL-LABEL: trunc_db_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovdb %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <16 x i32> %i to <16 x i8> - store <16 x i8> %x, <16 x i8>* %res - ret void -} - -define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 { -; KNL-LABEL: trunc_db_256: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_db_256: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovdb %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <8 x i32> %i to <8 x i8> - ret <8 x i8> %x -} - -define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 { -; KNL-LABEL: trunc_db_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_db_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovdb %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <8 x i32> %i to <8 x i8> - store <8 x i8> %x, <8 x i8>* %res - ret void -} - -define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 { -; ALL-LABEL: trunc_db_128: -; ALL: ## %bb.0: -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; ALL-NEXT: retq - %x = trunc <4 x i32> %i to <4 x i8> - ret <4 x i8> %x -} - -define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 { -; KNL-LABEL: trunc_db_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; KNL-NEXT: vmovd %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_db_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovdb %xmm0, (%rdi) -; SKX-NEXT: retq - %x = trunc <4 x i32> %i to <4 x i8> - store <4 x i8> %x, <4 x i8>* %res - ret void -} - -define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 { -; ALL-LABEL: trunc_dw_512: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovdw %zmm0, %ymm0 -; ALL-NEXT: retq - %x = trunc <16 x i32> %i to <16 x i16> - ret <16 x i16> %x -} - -define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 { -; ALL-LABEL: trunc_dw_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovdw %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x = trunc <16 x i32> %i to <16 x i16> - store <16 x i16> %x, <16 x i16>* %res - ret void -} - -define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 { -; KNL-LABEL: trunc_dw_256: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_dw_256: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovdw %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <8 x i32> %i to <8 x i16> - ret <8 x i16> %x -} - -define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 { -; KNL-LABEL: trunc_dw_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; KNL-NEXT: vpmovdw %zmm0, %ymm0 -; KNL-NEXT: vmovdqa %xmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; 
SKX-LABEL: trunc_dw_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovdw %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <8 x i32> %i to <8 x i16> - store <8 x i16> %x, <8 x i16>* %res - ret void -} - -define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 { -; KNL-LABEL: trunc_dw_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_dw_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovdw %xmm0, (%rdi) -; SKX-NEXT: retq - %x = trunc <4 x i32> %i to <4 x i16> - store <4 x i16> %x, <4 x i16>* %res - ret void -} - -define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 { -; KNL-LABEL: trunc_wb_512: -; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; KNL-NEXT: vpmovdb %zmm1, %xmm1 -; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_wb_512: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovwb %zmm0, %ymm0 -; SKX-NEXT: retq - %x = trunc <32 x i16> %i to <32 x i8> - ret <32 x i8> %x -} - -define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 { -; KNL-LABEL: trunc_wb_512_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; KNL-NEXT: vpmovdb %zmm1, 16(%rdi) -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_wb_512_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovwb %zmm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <32 x i16> %i to <32 x i8> - store <32 x i8> %x, <32 x i8>* %res - ret void -} - -define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 { -; KNL-LABEL: trunc_wb_256: -; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_wb_256: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovwb %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <16 x i16> %i to <16 x i8> - ret <16 x i8> %x -} - -define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 { -; KNL-LABEL: trunc_wb_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, (%rdi) 
-; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_wb_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovwb %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x = trunc <16 x i16> %i to <16 x i8> - store <16 x i8> %x, <16 x i8>* %res - ret void -} - -define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 { -; ALL-LABEL: trunc_wb_128: -; ALL: ## %bb.0: -; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; ALL-NEXT: retq - %x = trunc <8 x i16> %i to <8 x i8> - ret <8 x i8> %x -} - -define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 { -; KNL-LABEL: trunc_wb_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: trunc_wb_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovwb %xmm0, (%rdi) -; SKX-NEXT: retq - %x = trunc <8 x i16> %i to <8 x i8> - store <8 x i8> %x, <8 x i8>* %res - ret void -} - - -define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { -; KNL-LABEL: usat_trunc_wb_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: usat_trunc_wb_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovuswb %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x3 = icmp ult <16 x i16> %i, - %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> - %x6 = trunc <16 x i16> %x5 to <16 x i8> - store <16 x i8> %x6, <16 x i8>* %res, align 1 - ret void -} - -define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) { -; KNL-LABEL: usat_trunc_wb_256: -; KNL: ## %bb.0: -; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: usat_trunc_wb_256: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovuswb %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x3 = icmp ult <16 x i16> %i, - %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> - %x6 = trunc <16 x i16> %x5 to <16 x i8> - ret <16 x i8> %x6 -} - -define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) { -; KNL-LABEL: usat_trunc_wb_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: usat_trunc_wb_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) -; SKX-NEXT: retq - %x3 = icmp ult <8 x i16> %i, - %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> - %x6 = trunc <8 x i16> %x5 to <8 x i8> - store <8 x i8> %x6, <8 x i8>* %res, align 1 - ret void -} - -define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) { -; ALL-LABEL: usat_trunc_db_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovusdb %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x3 = icmp ult <16 x i32> %i, - %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> - %x6 = trunc <16 x i32> %x5 to <16 x i8> - store <16 x i8> %x6, <16 x i8>* %res, align 1 - ret void -} - -define void 
@usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) { -; ALL-LABEL: usat_trunc_qb_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovusqb %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x3 = icmp ult <8 x i64> %i, - %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> - %x6 = trunc <8 x i64> %x5 to <8 x i8> - store <8 x i8> %x6, <8 x i8>* %res, align 1 - ret void -} - -define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) { -; ALL-LABEL: usat_trunc_qd_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovusqd %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x3 = icmp ult <8 x i64> %i, - %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> - %x6 = trunc <8 x i64> %x5 to <8 x i32> - store <8 x i32> %x6, <8 x i32>* %res, align 1 - ret void -} - -define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) { -; ALL-LABEL: usat_trunc_qw_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovusqw %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x3 = icmp ult <8 x i64> %i, - %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> - %x6 = trunc <8 x i64> %x5 to <8 x i16> - store <8 x i16> %x6, <8 x i16>* %res, align 1 - ret void -} - -define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) { -; ALL-LABEL: usat_trunc_db_1024: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovusdb %zmm0, %xmm0 -; ALL-NEXT: vpmovusdb %zmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq - %x3 = icmp ult <32 x i32> %i, - %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> - %x6 = trunc <32 x i32> %x5 to <32 x i8> - ret <32 x i8> %x6 -} - -define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { -; ALL-LABEL: usat_trunc_db_1024_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovusdb %zmm0, %xmm0 -; ALL-NEXT: vpmovusdb %zmm1, %xmm1 -; ALL-NEXT: vmovdqu %xmm1, 16(%rdi) -; ALL-NEXT: vmovdqu %xmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x3 = icmp ult <32 x i32> %i, - %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> - %x6 = trunc <32 x i32> %x5 to <32 x i8> - store <32 x i8>%x6, <32 x i8>* %p, align 1 - ret void -} - -define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) { -; ALL-LABEL: usat_trunc_dw_512: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovusdw %zmm0, %ymm0 -; ALL-NEXT: retq - %x3 = icmp ult <16 x i32> %i, - %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> - %x6 = trunc <16 x i32> %x5 to <16 x i16> - ret <16 x i16> %x6 -} - -define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) { -; ALL-LABEL: usat_trunc_wb_128: -; ALL: ## %bb.0: -; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0 -; ALL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; ALL-NEXT: retq - %x3 = icmp ult <8 x i16> %i, - %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> - %x6 = trunc <8 x i16> %x5 to <8 x i8> - ret <8 x i8>%x6 -} - -define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) { -; ALL-LABEL: usat_trunc_qw_1024: -; ALL: ## %bb.0: -; ALL-NEXT: vpmovusqw %zmm0, %xmm0 -; ALL-NEXT: vpmovusqw %zmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq - %x3 = icmp ult <16 x i64> %i, - %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> - %x6 = trunc <16 x i64> %x5 to <16 x i16> - ret <16 x i16> %x6 -} - -define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) { -; KNL-LABEL: usat_trunc_db_256: -; KNL: ## %bb.0: -; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: usat_trunc_db_256: -; SKX: ## %bb.0: -; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; 
SKX-NEXT: vpmovdb %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %tmp1 = icmp ult <8 x i32> %x, - %tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> - %tmp3 = trunc <8 x i32> %tmp2 to <8 x i8> - %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> - ret <16 x i8> %tmp4 -} - - - -; Tests for the following unsigned saturation pattern: - -; %a = icmp sgt %x, C1 -; %b = select %a, %x, C2 -; %c = icmp slt %b, C2 -; %d = select %c, %b, C2 -; %res = trunc %d - - -define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) { -; KNL-LABEL: smax_usat_trunc_wb_256_mem1: -; KNL: ## %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: smax_usat_trunc_wb_256_mem1: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovuswb %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x1 = icmp sgt <16 x i16> %i, - %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> - %x3 = icmp slt <16 x i16> %x2, - %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> - %x6 = trunc <16 x i16> %x5 to <16 x i8> - store <16 x i8> %x6, <16 x i8>* %res, align 1 - ret void -} - -; Test for smax(smin(x, C2), C1). -define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) { -; KNL-LABEL: smax_usat_trunc_wb_256_mem2: -; KNL: ## %bb.0: -; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: smax_usat_trunc_wb_256_mem2: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovuswb %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x1 = icmp slt <16 x i16> %i, - %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> - %x3 = icmp sgt <16 x i16> %x2, - %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> - %x6 = trunc <16 x i16> %x5 to <16 x i8> - store <16 x i8> %x6, <16 x i8>* %res, align 1 - ret void -} - -define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) { -; KNL-LABEL: smax_usat_trunc_wb_256: -; KNL: ## %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, %xmm0 -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: smax_usat_trunc_wb_256: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovuswb %ymm0, %xmm0 -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x1 = icmp sgt <16 x i16> %i, - %x2 = 
select <16 x i1> %x1, <16 x i16> %i, <16 x i16> - %x3 = icmp slt <16 x i16> %x2, - %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> - %x6 = trunc <16 x i16> %x5 to <16 x i8> - ret <16 x i8> %x6 - } - -define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) { -; KNL-LABEL: smax_usat_trunc_wb_128_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; KNL-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0 -; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; KNL-NEXT: vmovq %xmm0, (%rdi) -; KNL-NEXT: retq -; -; SKX-LABEL: smax_usat_trunc_wb_128_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; SKX-NEXT: vpmovuswb %xmm0, (%rdi) -; SKX-NEXT: retq - %x1 = icmp sgt <8 x i16> %i, - %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> - %x3 = icmp slt <8 x i16> %x2, - %x5 = select <8 x i1> %x3, <8 x i16> %x2, <8 x i16> - %x6 = trunc <8 x i16> %x5 to <8 x i8> - store <8 x i8> %x6, <8 x i8>* %res, align 1 - ret void -} - -define void @smax_usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) { -; ALL-LABEL: smax_usat_trunc_db_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 -; ALL-NEXT: vpmovusdb %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x1 = icmp sgt <16 x i32> %i, - %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> - %x3 = icmp slt <16 x i32> %x2, - %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> - %x6 = trunc <16 x i32> %x5 to <16 x i8> - store <16 x i8> %x6, <16 x i8>* %res, align 1 - ret void -} - -define void @smax_usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) { -; ALL-LABEL: smax_usat_trunc_qb_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; ALL-NEXT: vpmovusqb %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x1 = icmp sgt <8 x i64> %i, - %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> - %x3 = icmp slt <8 x i64> %x2, - %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> - %x6 = trunc <8 x i64> %x5 to <8 x i8> - store <8 x i8> %x6, <8 x i8>* %res, align 1 - ret void -} - -define void @smax_usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) { -; ALL-LABEL: smax_usat_trunc_qd_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; ALL-NEXT: vpmovusqd %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x1 = icmp sgt <8 x i64> %i, - %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> - %x3 = icmp slt <8 x i64> %x2, - %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> - %x6 = trunc <8 x i64> %x5 to <8 x i32> - store <8 x i32> %x6, <8 x i32>* %res, align 1 - ret void -} - -define void @smax_usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) { -; ALL-LABEL: smax_usat_trunc_qw_512_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; ALL-NEXT: vpmovusqw %zmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x1 = icmp sgt <8 x i64> %i, - %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> - %x3 = icmp slt <8 x i64> %x2, - %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> - %x6 = trunc <8 x i64> %x5 to <8 x i16> - store <8 x i16> %x6, <8 x i16>* %res, align 1 - ret void -} - -define <32 x i8> @smax_usat_trunc_db_1024(<32 x i32> %i) { -; ALL-LABEL: smax_usat_trunc_db_1024: -; ALL: ## %bb.0: -; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1 -; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0 -; 
ALL-NEXT: vpmovusdb %zmm0, %xmm0 -; ALL-NEXT: vpmovusdb %zmm1, %xmm1 -; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: retq - %x1 = icmp sgt <32 x i32> %i, - %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> - %x3 = icmp slt <32 x i32> %x2, - %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> - %x6 = trunc <32 x i32> %x5 to <32 x i8> - ret <32 x i8> %x6 -} - -define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) { -; ALL-LABEL: smax_usat_trunc_db_1024_mem: -; ALL: ## %bb.0: -; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; ALL-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1 -; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0 -; ALL-NEXT: vpmovusdb %zmm0, %xmm0 -; ALL-NEXT: vpmovusdb %zmm1, %xmm1 -; ALL-NEXT: vmovdqu %xmm1, 16(%rdi) -; ALL-NEXT: vmovdqu %xmm0, (%rdi) -; ALL-NEXT: vzeroupper -; ALL-NEXT: retq - %x1 = icmp sgt <32 x i32> %i, - %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> - %x3 = icmp slt <32 x i32> %x2, - %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> - %x6 = trunc <32 x i32> %x5 to <32 x i8> - store <32 x i8>%x6, <32 x i8>* %p, align 1 - ret void -} - -define <16 x i16> @smax_usat_trunc_dw_512(<16 x i32> %i) { -; ALL-LABEL: smax_usat_trunc_dw_512: -; ALL: ## %bb.0: -; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 -; ALL-NEXT: vpmovusdw %zmm0, %ymm0 -; ALL-NEXT: retq - %x1 = icmp sgt <16 x i32> %i, - %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> - %x3 = icmp slt <16 x i32> %x2, - %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> - %x6 = trunc <16 x i32> %x5 to <16 x i16> - ret <16 x i16> %x6 -} - -define void @negative_test1_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { -; KNL-LABEL: negative_test1_smax_usat_trunc_wb_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; KNL-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: negative_test1_smax_usat_trunc_wb_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; SKX-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; SKX-NEXT: vpmovwb %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x1 = icmp sgt <16 x i16> %i, - %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> - %x3 = icmp slt <16 x i16> %x2, - %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> - %x6 = trunc <16 x i16> %x5 to <16 x i8> - store <16 x i8> %x6, <16 x i8>* %res, align 1 - ret void -} - -define void @negative_test2_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) { -; KNL-LABEL: negative_test2_smax_usat_trunc_wb_256_mem: -; KNL: ## %bb.0: -; KNL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; KNL-NEXT: vpmovdb %zmm0, (%rdi) -; KNL-NEXT: vzeroupper -; KNL-NEXT: retq -; -; SKX-LABEL: negative_test2_smax_usat_trunc_wb_256_mem: -; SKX: ## %bb.0: -; SKX-NEXT: vpmaxsw 
{{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; SKX-NEXT: vpmovwb %ymm0, (%rdi) -; SKX-NEXT: vzeroupper -; SKX-NEXT: retq - %x1 = icmp sgt <16 x i16> %i, - %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> - %x3 = icmp slt <16 x i16> %x2, - %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> - %x6 = trunc <16 x i16> %x5 to <16 x i8> - store <16 x i8> %x6, <16 x i8>* %res, align 1 - ret void -} diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll index f6473f8fc2f..4bb2b764ccc 100644 --- a/test/CodeGen/X86/bswap-vector.ll +++ b/test/CodeGen/X86/bswap-vector.ll @@ -2,7 +2,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-NOSSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-AVX --check-prefix=CHECK-AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-WIDE-AVX --check-prefix=CHECK-WIDE-AVX2 declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>) declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) @@ -31,11 +30,6 @@ define <8 x i16> @test1(<8 x i16> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: test1: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v) ret <8 x i16> %r @@ -64,11 +58,6 @@ define <4 x i32> @test2(<4 x i32> %v) { ; CHECK-AVX: # %bb.0: ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: test2: -; CHECK-WIDE-AVX: # %bb.0: -; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; CHECK-WIDE-AVX-NEXT: retq %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v) ret <4 x i32> %r } @@ -99,12 +88,6 @@ define <4 x i32> @or_bswap(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %p1, <4 x i32> ; CHECK-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: or_bswap: -; CHECK-WIDE-AVX: # %bb.0: -; CHECK-WIDE-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12] -; CHECK-WIDE-AVX-NEXT: retq %xt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x) %yt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %y) %r = or <4 x i32> %xt, %yt @@ -136,11 +119,6 @@ define <2 x i64> @test3(<2 x i64> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: test3: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v) ret <2 x i64> %r @@ -183,11 +161,6 @@ define <16 x i16> @test4(<16 x i16> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: test4: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v) ret <16 x i16> %r @@ -226,11 +199,6 @@ define <8 x i32> @test5(<8 x i32> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: test5: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v) ret <8 x i32> %r @@ -273,11 +241,6 @@ define <4 x i64> @test6(<4 x i64> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: test6: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v) ret <4 x i64> %r @@ -308,11 +271,6 @@ define <4 x i16> @test7(<4 x i16> %v) { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: test7: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v) ret <4 x i16> %r @@ -406,11 +364,6 @@ define <8 x i16> @fold_v8i16() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: fold_v8i16: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> ) ret <8 x i16> %r @@ -426,11 +379,6 @@ define <4 x i32> @fold_v4i32() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: fold_v4i32: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> ) ret <4 x i32> %r @@ -446,11 +394,6 @@ define <2 x i64> @fold_v2i64() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: fold_v2i64: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> ) ret <2 x i64> %r @@ -467,11 +410,6 @@ define <16 x i16> @fold_v16i16() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = 
[0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: fold_v16i16: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> ) ret <16 x i16> %r @@ -488,11 +426,6 @@ define <8 x i32> @fold_v8i32() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: fold_v8i32: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> ) ret <8 x i32> %r @@ -509,11 +442,6 @@ define <4 x i64> @fold_v4i64() { ; CHECK-AVX: # %bb.0: # %entry ; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160] ; CHECK-AVX-NEXT: retq -; -; CHECK-WIDE-AVX-LABEL: fold_v4i64: -; CHECK-WIDE-AVX: # %bb.0: # %entry -; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160] -; CHECK-WIDE-AVX-NEXT: retq entry: %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> ) ret <4 x i64> %r diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll index e31c9f8967c..d41d2ea83bc 100644 --- a/test/CodeGen/X86/lower-bitcast.ll +++ b/test/CodeGen/X86/lower-bitcast.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core2 -mattr=+sse2 | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core2 -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE ; FIXME: Ideally we should be able to fold the entire body of @test1 into a ; single paddd instruction. 
At the moment we produce the sequence @@ -11,11 +10,6 @@ define double @test1(double %A) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test1: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <2 x i32> %add = add <2 x i32> %1, %2 = bitcast <2 x i32> %add to double @@ -27,11 +21,6 @@ define double @test2(double %A, double %B) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddd %xmm1, %xmm0 ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test2: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: paddd %xmm1, %xmm0 -; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <2 x i32> %2 = bitcast double %B to <2 x i32> %add = add <2 x i32> %1, %2 @@ -46,13 +35,6 @@ define i64 @test3(i64 %A) { ; CHECK-NEXT: addps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test3: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: movq %rdi, %xmm0 -; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: movq %xmm0, %rax -; CHECK-WIDE-NEXT: retq %1 = bitcast i64 %A to <2 x float> %add = fadd <2 x float> %1, %2 = bitcast <2 x float> %add to i64 @@ -69,13 +51,6 @@ define i64 @test4(i64 %A) { ; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0 ; CHECK-NEXT: movq %xmm0, %rax ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test4: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: movq %rdi, %xmm0 -; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: movq %xmm0, %rax -; CHECK-WIDE-NEXT: retq %1 = bitcast i64 %A to <2 x i32> %add = add <2 x i32> %1, %2 = bitcast <2 x i32> %add to i64 @@ -87,11 +62,6 @@ define double @test5(double %A) { ; CHECK: # %bb.0: ; CHECK-NEXT: addps {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test5: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <2 x float> %add = fadd <2 x float> %1, %2 = bitcast <2 x float> %add to double @@ -106,11 +76,6 @@ define double @test6(double %A) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test6: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: paddw {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <4 x i16> %add = add <4 x i16> %1, %2 = bitcast <4 x i16> %add to double @@ -122,11 +87,6 @@ define double @test7(double %A, double %B) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddw %xmm1, %xmm0 ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test7: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: paddw %xmm1, %xmm0 -; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <4 x i16> %2 = bitcast double %B to <4 x i16> %add = add <4 x i16> %1, %2 @@ -143,11 +103,6 @@ define double @test8(double %A) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0 ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test8: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: paddb {{.*}}(%rip), %xmm0 -; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <8 x i8> %add = add <8 x i8> %1, %2 = bitcast <8 x i8> %add to double @@ -159,11 +114,6 @@ define double @test9(double %A, double %B) { ; CHECK: # %bb.0: ; CHECK-NEXT: paddb %xmm1, %xmm0 ; CHECK-NEXT: retq -; -; CHECK-WIDE-LABEL: test9: -; CHECK-WIDE: # %bb.0: -; CHECK-WIDE-NEXT: paddb %xmm1, %xmm0 -; CHECK-WIDE-NEXT: retq %1 = bitcast double %A to <8 x i8> %2 = bitcast double %B to <8 x i8> %add = add <8 x i8> %1, %2 diff --git a/test/CodeGen/X86/masked_gather_scatter_widen.ll b/test/CodeGen/X86/masked_gather_scatter_widen.ll index fc9aa62b948..fc6341640c4 100644 --- 
a/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -1,10 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_SKX -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_KNL -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PROMOTE --check-prefix=PROMOTE_SKX -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PROMOTE --check-prefix=PROMOTE_KNL -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=WIDEN_AVX2 -; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=PROMOTE_AVX2 +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_SKX +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_KNL +; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=WIDEN_AVX2 define <2 x double> @test_gather_v2i32_index(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) { ; WIDEN_SKX-LABEL: test_gather_v2i32_index: @@ -28,40 +25,12 @@ define <2 x double> @test_gather_v2i32_index(double* %base, <2 x i32> %ind, <2 x ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; -; PROMOTE_SKX-LABEL: test_gather_v2i32_index: -; PROMOTE_SKX: # %bb.0: -; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; PROMOTE_SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1} -; PROMOTE_SKX-NEXT: vmovapd %xmm2, %xmm0 -; PROMOTE_SKX-NEXT: retq -; -; PROMOTE_KNL-LABEL: test_gather_v2i32_index: -; PROMOTE_KNL: # %bb.0: -; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 -; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1} -; PROMOTE_KNL-NEXT: vmovapd %xmm2, %xmm0 -; PROMOTE_KNL-NEXT: vzeroupper -; PROMOTE_KNL-NEXT: retq -; ; WIDEN_AVX2-LABEL: test_gather_v2i32_index: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1 ; WIDEN_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2 ; WIDEN_AVX2-NEXT: vmovapd %xmm2, %xmm0 ; WIDEN_AVX2-NEXT: retq -; -; PROMOTE_AVX2-LABEL: test_gather_v2i32_index: -; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2 -; PROMOTE_AVX2-NEXT: vmovapd %xmm2, %xmm0 -; PROMOTE_AVX2-NEXT: retq %gep.random = getelementptr double, double* %base, <2 x i32> %ind %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0) ret <2 x double> %res @@ -87,25 +56,6 @@ define void @test_scatter_v2i32_index(<2 x double> %a1, double* 
%base, <2 x i32> ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; -; PROMOTE_SKX-LABEL: test_scatter_v2i32_index: -; PROMOTE_SKX: # %bb.0: -; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1} -; PROMOTE_SKX-NEXT: retq -; -; PROMOTE_KNL-LABEL: test_scatter_v2i32_index: -; PROMOTE_KNL: # %bb.0: -; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 -; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 -; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1} -; PROMOTE_KNL-NEXT: vzeroupper -; PROMOTE_KNL-NEXT: retq -; ; WIDEN_AVX2-LABEL: test_scatter_v2i32_index: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 @@ -131,32 +81,6 @@ define void @test_scatter_v2i32_index(<2 x double> %a1, double* %base, <2 x i32> ; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax ; WIDEN_AVX2-NEXT: vmovhps %xmm0, (%rax) ; WIDEN_AVX2-NEXT: retq -; -; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index: -; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vpsllq $3, %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3 -; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3 -; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax -; PROMOTE_AVX2-NEXT: testb $1, %al -; PROMOTE_AVX2-NEXT: jne .LBB1_1 -; PROMOTE_AVX2-NEXT: # %bb.2: # %else -; PROMOTE_AVX2-NEXT: testb $2, %al -; PROMOTE_AVX2-NEXT: jne .LBB1_3 -; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2 -; PROMOTE_AVX2-NEXT: retq -; PROMOTE_AVX2-NEXT: .LBB1_1: # %cond.store -; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx -; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rcx) -; PROMOTE_AVX2-NEXT: testb $2, %al -; PROMOTE_AVX2-NEXT: je .LBB1_4 -; PROMOTE_AVX2-NEXT: .LBB1_3: # %cond.store1 -; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; PROMOTE_AVX2-NEXT: vmovhps %xmm0, (%rax) -; PROMOTE_AVX2-NEXT: retq %gep = getelementptr double, double *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask) ret void @@ -184,27 +108,6 @@ define <2 x i32> @test_gather_v2i32_data(<2 x i32*> %ptr, <2 x i1> %mask, <2 x i ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; -; PROMOTE_SKX-LABEL: test_gather_v2i32_data: -; PROMOTE_SKX: # %bb.0: -; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm2 {%k1} -; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0 -; PROMOTE_SKX-NEXT: retq -; -; PROMOTE_KNL-LABEL: test_gather_v2i32_data: -; PROMOTE_KNL: # %bb.0: -; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 -; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 -; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1} -; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0 -; PROMOTE_KNL-NEXT: vzeroupper -; PROMOTE_KNL-NEXT: retq -; ; WIDEN_AVX2-LABEL: test_gather_v2i32_data: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] @@ -212,14 +115,6 @@ define <2 x i32> @test_gather_v2i32_data(<2 x i32*> %ptr, <2 x i1> %mask, <2 x i ; WIDEN_AVX2-NEXT: vpgatherqd 
%xmm1, (,%xmm0), %xmm2 ; WIDEN_AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; WIDEN_AVX2-NEXT: retq -; -; PROMOTE_AVX2-LABEL: test_gather_v2i32_data: -; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vpgatherqd %xmm1, (,%xmm0), %xmm2 -; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0 -; PROMOTE_AVX2-NEXT: retq %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0) ret <2 x i32>%res } @@ -244,25 +139,6 @@ define void @test_scatter_v2i32_data(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mas ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; -; PROMOTE_SKX-LABEL: test_scatter_v2i32_data: -; PROMOTE_SKX: # %bb.0: -; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1} -; PROMOTE_SKX-NEXT: retq -; -; PROMOTE_KNL-LABEL: test_scatter_v2i32_data: -; PROMOTE_KNL: # %bb.0: -; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 -; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} -; PROMOTE_KNL-NEXT: vzeroupper -; PROMOTE_KNL-NEXT: retq -; ; WIDEN_AVX2-LABEL: test_scatter_v2i32_data: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2 @@ -283,27 +159,6 @@ define void @test_scatter_v2i32_data(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mas ; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax ; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax) ; WIDEN_AVX2-NEXT: retq -; -; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data: -; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax -; PROMOTE_AVX2-NEXT: testb $1, %al -; PROMOTE_AVX2-NEXT: jne .LBB3_1 -; PROMOTE_AVX2-NEXT: # %bb.2: # %else -; PROMOTE_AVX2-NEXT: testb $2, %al -; PROMOTE_AVX2-NEXT: jne .LBB3_3 -; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2 -; PROMOTE_AVX2-NEXT: retq -; PROMOTE_AVX2-NEXT: .LBB3_1: # %cond.store -; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx -; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx) -; PROMOTE_AVX2-NEXT: testb $2, %al -; PROMOTE_AVX2-NEXT: je .LBB3_4 -; PROMOTE_AVX2-NEXT: .LBB3_3: # %cond.store1 -; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax) -; PROMOTE_AVX2-NEXT: retq call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask) ret void } @@ -330,27 +185,6 @@ define <2 x i32> @test_gather_v2i32_data_index(i32* %base, <2 x i32> %ind, <2 x ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; -; PROMOTE_SKX-LABEL: test_gather_v2i32_data_index: -; PROMOTE_SKX: # %bb.0: -; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1 -; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1} -; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0 -; PROMOTE_SKX-NEXT: retq -; -; PROMOTE_KNL-LABEL: test_gather_v2i32_data_index: -; PROMOTE_KNL: # %bb.0: -; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 -; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1 -; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0 -; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 -; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} -; 
PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0 -; PROMOTE_KNL-NEXT: vzeroupper -; PROMOTE_KNL-NEXT: retq -; ; WIDEN_AVX2-LABEL: test_gather_v2i32_data_index: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero @@ -358,14 +192,6 @@ define <2 x i32> @test_gather_v2i32_data_index(i32* %base, <2 x i32> %ind, <2 x ; WIDEN_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2 ; WIDEN_AVX2-NEXT: vmovdqa %xmm2, %xmm0 ; WIDEN_AVX2-NEXT: retq -; -; PROMOTE_AVX2-LABEL: test_gather_v2i32_data_index: -; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero -; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2 -; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0 -; PROMOTE_AVX2-NEXT: retq %gep.random = getelementptr i32, i32* %base, <2 x i32> %ind %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) ret <2 x i32> %res @@ -391,25 +217,6 @@ define void @test_scatter_v2i32_data_index(<2 x i32> %a1, i32* %base, <2 x i32> ; WIDEN_KNL-NEXT: vzeroupper ; WIDEN_KNL-NEXT: retq ; -; PROMOTE_SKX-LABEL: test_scatter_v2i32_data_index: -; PROMOTE_SKX: # %bb.0: -; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1 -; PROMOTE_SKX-NEXT: vpscatterdd %xmm0, (%rdi,%xmm1,4) {%k1} -; PROMOTE_SKX-NEXT: retq -; -; PROMOTE_KNL-LABEL: test_scatter_v2i32_data_index: -; PROMOTE_KNL: # %bb.0: -; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0 -; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0 -; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1 -; PROMOTE_KNL-NEXT: vpscatterdd %zmm0, (%rdi,%zmm1,4) {%k1} -; PROMOTE_KNL-NEXT: vzeroupper -; PROMOTE_KNL-NEXT: retq -; ; WIDEN_AVX2-LABEL: test_scatter_v2i32_data_index: ; WIDEN_AVX2: # %bb.0: ; WIDEN_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 @@ -435,32 +242,6 @@ define void @test_scatter_v2i32_data_index(<2 x i32> %a1, i32* %base, <2 x i32> ; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax ; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax) ; WIDEN_AVX2-NEXT: retq -; -; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index: -; PROMOTE_AVX2: # %bb.0: -; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vpsllq $2, %xmm1, %xmm1 -; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3 -; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3 -; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2 -; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax -; PROMOTE_AVX2-NEXT: testb $1, %al -; PROMOTE_AVX2-NEXT: jne .LBB5_1 -; PROMOTE_AVX2-NEXT: # %bb.2: # %else -; PROMOTE_AVX2-NEXT: testb $2, %al -; PROMOTE_AVX2-NEXT: jne .LBB5_3 -; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2 -; PROMOTE_AVX2-NEXT: retq -; PROMOTE_AVX2-NEXT: .LBB5_1: # %cond.store -; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx -; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx) -; PROMOTE_AVX2-NEXT: testb $2, %al -; PROMOTE_AVX2-NEXT: je .LBB5_4 -; PROMOTE_AVX2-NEXT: .LBB5_3: # %cond.store1 -; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax -; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax) -; PROMOTE_AVX2-NEXT: retq %gep = getelementptr i32, i32 *%base, <2 x i32> %ind call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %gep, i32 4, <2 x i1> %mask) ret void diff --git a/test/CodeGen/X86/pmulh.ll b/test/CodeGen/X86/pmulh.ll index 269d0b51c7b..eef26dfa8c2 100644 --- a/test/CodeGen/X86/pmulh.ll 
+++ b/test/CodeGen/X86/pmulh.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-WIDEN ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-WIDEN ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW diff --git a/test/CodeGen/X86/shrink_vmul-widen.ll b/test/CodeGen/X86/shrink_vmul-widen.ll deleted file mode 100644 index 75998580074..00000000000 --- a/test/CodeGen/X86/shrink_vmul-widen.ll +++ /dev/null @@ -1,2553 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2 - -@c = external global i32*, align 8 - -; %val1 = load <2 x i8> -; %op1 = zext<2 x i32> %val1 -; %val2 = load <2 x i8> -; %op2 = zext<2 x i32> %val2 -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_2xi8: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax -; X86-SSE-NEXT: movd %eax, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx -; X86-AVX-NEXT: vmovd %edx, %xmm0 -; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax -; X86-AVX-NEXT: vmovd %eax, %xmm1 -; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm1 -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = zext <2 x i8> %wide.load to <2 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <2 x i8>* - %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 - %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <4 x i8> -; %op1 = zext<4 x i32> %val1 -; %val2 = load <4 x i8> -; %op2 = zext<4 x i32> %val2 -; %rst = mul <4 x i32> %op1, %op2 -; -define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, 
i64 %index) nounwind { -; X86-SSE-LABEL: mul_4xi8: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_4xi8: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_4xi8: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_4xi8: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <4 x i8>* - %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1 - %tmp8 = zext <4 x i8> %wide.load to <4 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <4 x i8>* - %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1 - %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32> - %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <4 x i32>* - 
store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <8 x i8> -; %op1 = zext<8 x i32> %val1 -; %val2 = load <8 x i8> -; %op2 = zext<8 x i32> %val2 -; %rst = mul <8 x i32> %op1, %op2 -; -define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_8xi8: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: mul_8xi8: -; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) -; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: mul_8xi8: -; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: pushl %esi -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: movl c, %esi -; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) -; X86-AVX2-NEXT: popl %esi -; X86-AVX2-NEXT: vzeroupper -; X86-AVX2-NEXT: retl -; -; X64-SSE-LABEL: mul_8xi8: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-SSE-NEXT: movq 
{{.*#+}} xmm1 = mem[0],zero -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX1-LABEL: mul_8xi8: -; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: mul_8xi8: -; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0 -; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <8 x i8>* - %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1 - %tmp8 = zext <8 x i8> %wide.load to <8 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <8 x i8>* - %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1 - %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32> - %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <8 x i32>* - store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <16 x i8> -; %op1 = zext<16 x i32> %val1 -; %val2 = load <16 x i8> -; %op2 = zext<16 x i32> %val2 -; %rst = mul <16 x i32> %op1, %op2 -; -define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_16xi8: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movdqu (%edx,%ecx), 
%xmm0 -; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X86-SSE-NEXT: movdqa %xmm1, %xmm4 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X86-SSE-NEXT: pmullw %xmm3, %xmm4 -; X86-SSE-NEXT: movdqa %xmm4, %xmm3 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: mul_16xi8: -; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) -; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: mul_16xi8: -; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: pushl %esi -; X86-AVX2-NEXT: movl 
{{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: movl c, %esi -; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 -; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) -; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) -; X86-AVX2-NEXT: popl %esi -; X86-AVX2-NEXT: vzeroupper -; X86-AVX2-NEXT: retl -; -; X64-SSE-LABEL: mul_16xi8: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X64-SSE-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X64-SSE-NEXT: pmullw %xmm3, %xmm4 -; X64-SSE-NEXT: movdqa %xmm4, %xmm3 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX1-LABEL: mul_16xi8: -; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0 -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1 -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: mul_16xi8: -; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1 -; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) -; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <16 x i8>* - %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1 - %tmp8 = zext <16 x i8> %wide.load to <16 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <16 x i8>* - %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1 - %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32> - %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <16 x i32>* - store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <2 x i16> -; %op1 = zext<2 x i32> %val1 -; %val2 = load <2 x i16> -; %op2 = zext<2 x i32> %val2 -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_2xi16: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: 
movl c, %esi -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi16: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi16: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi16: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i16>* - %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 - %tmp8 = zext <2 x i16> %wide.load to <2 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <2 x i16>* - %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 - %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <4 x i16> -; %op1 = zext<4 x i32> %val1 -; %val2 = load <4 x i16> -; %op2 = zext<4 x i32> %val2 -; %rst = mul <4 x i32> %op1, %op2 -; -define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_4xi16: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 -; X86-SSE-NEXT: 
pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_4xi16: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4) -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_4xi16: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X64-SSE-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_4xi16: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <4 x i16>* - %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1 - %tmp8 = zext <4 x i16> %wide.load to <4 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <4 x i16>* - %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1 - %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32> - %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <4 x i32>* - store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <8 x i16> -; %op1 = zext<8 x i32> %val1 -; %val2 = load <8 x i16> -; %op2 = zext<8 x i32> %val2 -; %rst = mul <8 x i32> %op1, %op2 -; -define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_8xi16: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2 -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: mul_8xi16: -; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi -; 
X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4) -; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: mul_8xi16: -; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: pushl %esi -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: movl c, %esi -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) -; X86-AVX2-NEXT: popl %esi -; X86-AVX2-NEXT: vzeroupper -; X86-AVX2-NEXT: retl -; -; X64-SSE-LABEL: mul_8xi16: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 -; X64-SSE-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX1-LABEL: mul_8xi16: -; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4) -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: mul_8xi16: -; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 -; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <8 x i16>* - %wide.load = load <8 x i16>, <8 x i16>* %tmp7, 
align 1 - %tmp8 = zext <8 x i16> %wide.load to <8 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <8 x i16>* - %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1 - %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32> - %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <8 x i32>* - store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <16 x i16> -; %op1 = zext<16 x i32> %val1 -; %val2 = load <16 x i16> -; %op2 = zext<16 x i32> %val2 -; %rst = mul <16 x i32> %op1, %op2 -; -define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_16xi16: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-SSE-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 -; X86-SSE-NEXT: pmullw %xmm0, %xmm2 -; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 -; X86-SSE-NEXT: pmullw %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: mul_16xi16: -; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) -; X86-AVX1-NEXT: popl %esi -; 
X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: mul_16xi16: -; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: pushl %esi -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: movl c, %esi -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 -; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) -; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) -; X86-AVX2-NEXT: popl %esi -; X86-AVX2-NEXT: vzeroupper -; X86-AVX2-NEXT: retl -; -; X64-SSE-LABEL: mul_16xi16: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 -; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; X64-SSE-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 -; X64-SSE-NEXT: pmullw %xmm0, %xmm2 -; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X64-SSE-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 -; X64-SSE-NEXT: pmullw %xmm1, %xmm3 -; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX1-LABEL: mul_16xi16: -; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) -; 
X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: mul_16xi16: -; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 -; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) -; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <16 x i16>* - %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 - %tmp8 = zext <16 x i16> %wide.load to <16 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <16 x i16>* - %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 - %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32> - %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <16 x i32>* - store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <2 x i8> -; %op1 = sext<2 x i32> %val1 -; %val2 = load <2 x i8> -; %op2 = sext<2 x i32> %val2 -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_2xi8_sext: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax -; X86-SSE-NEXT: movd %eax, %xmm1 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: psraw $8, %xmm0 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: psraw $8, %xmm1 -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8_sext: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx -; X86-AVX-NEXT: vmovd %edx, %xmm0 -; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax -; X86-AVX-NEXT: vmovd %eax, %xmm1 -; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 -; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8_sext: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rdx), 
%ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm1 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: psraw $8, %xmm0 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: psraw $8, %xmm1 -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8_sext: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1 -; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = sext <2 x i8> %wide.load to <2 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <2 x i8>* - %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 - %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <2 x i8> -; %op1 = sext<2 x i32> %val1 -; %val2 = load <2 x i8> -; %op2 = zext<2 x i32> %val2 -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_2xi8_sext_zext: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax -; X86-SSE-NEXT: movd %eax, %xmm1 -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: psraw $8, %xmm0 -; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 -; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8_sext_zext: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx -; X86-AVX-NEXT: vmovd %edx, %xmm0 -; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax -; X86-AVX-NEXT: vmovd %eax, %xmm1 -; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8_sext_zext: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm1 -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: psraw $8, %xmm0 -; X64-SSE-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 -; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8_sext_zext: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm1 -; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = sext <2 x i8> %wide.load to <2 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <2 x i8>* - %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 - %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <2 x i16> -; %op1 = sext<2 x i32> %val1 -; %val2 = load <2 x i16> -; %op2 = sext<2 x i32> %val2 -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_2xi16_sext: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 -; X86-SSE-NEXT: pmullw %xmm0, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi16_sext: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vmovd {{.*#+}} 
xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1 -; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi16_sext: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 -; X64-SSE-NEXT: pmullw %xmm0, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi16_sext: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1 -; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i16>* - %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 - %tmp8 = sext <2 x i16> %wide.load to <2 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <2 x i16>* - %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 - %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <2 x i16> -; %op1 = sext<2 x i32> %val1 -; %val2 = load <2 x i16> -; %op2 = zext<2 x i32> %val2 -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_2xi16_sext_zext: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi16_sext_zext: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: pushl %esi -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX-NEXT: movl c, %esi -; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) -; X86-AVX-NEXT: popl %esi -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi16_sext_zext: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi16_sext_zext: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i16>* - %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 - %tmp8 = sext <2 x i16> %wide.load to <2 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <2 x i16>* - %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 - %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val1 = load <16 x i16> -; %op1 = sext<16 x i32> %val1 -; %val2 = load <16 x i16> -; %op2 = sext<16 x i32> %val2 -; %rst = mul <16 x i32> %op1, %op2 -; -define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind { -; X86-SSE-LABEL: mul_16xi16_sext: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SSE-NEXT: movl c, %esi -; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 -; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 -; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 -; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 -; X86-SSE-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 -; X86-SSE-NEXT: pmullw %xmm0, %xmm2 -; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X86-SSE-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 -; X86-SSE-NEXT: pmullw %xmm1, %xmm3 -; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X86-SSE-NEXT: punpcklwd 
{{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: mul_16xi16_sext: -; X86-AVX1: # %bb.0: # %entry -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX1-NEXT: movl c, %esi -; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0 -; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1 -; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2 -; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3 -; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4 -; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 -; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4 -; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4 -; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4 -; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4) -; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4) -; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: mul_16xi16_sext: -; X86-AVX2: # %bb.0: # %entry -; X86-AVX2-NEXT: pushl %esi -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-AVX2-NEXT: movl c, %esi -; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0 -; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1 -; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2 -; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2 -; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 -; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) -; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) -; X86-AVX2-NEXT: popl %esi -; X86-AVX2-NEXT: vzeroupper -; X86-AVX2-NEXT: retl -; -; X64-SSE-LABEL: mul_16xi16_sext: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 -; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 -; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 -; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 -; X64-SSE-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 -; X64-SSE-NEXT: pmullw %xmm0, %xmm2 -; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; X64-SSE-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 -; X64-SSE-NEXT: pmullw %xmm1, %xmm3 -; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) -; X64-SSE-NEXT: retq -; -; X64-AVX1-LABEL: mul_16xi16_sext: -; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0 -; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1 
-; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2 -; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3 -; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4 -; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 -; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4 -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 -; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4 -; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4 -; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4) -; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4) -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: mul_16xi16_sext: -; X64-AVX2: # %bb.0: # %entry -; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0 -; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1 -; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2 -; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 -; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2 -; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 -; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) -; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <16 x i16>* - %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 - %tmp8 = sext <16 x i16> %wide.load to <16 x i32> - %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index - %tmp11 = bitcast i8* %tmp10 to <16 x i16>* - %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 - %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32> - %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <16 x i32>* - store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i8> -; %op1 = zext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi8_varconst1: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx -; X86-SSE-NEXT: movd %ecx, %xmm0 -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8_varconst1: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm0 -; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8_varconst1: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 
-; X64-SSE-NEXT: pxor %xmm1, %xmm1 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8_varconst1: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = zext <2 x i8> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i8> -; %op1 = sext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi8_varconst2: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx -; X86-SSE-NEXT: movd %ecx, %xmm0 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: psraw $8, %xmm0 -; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8_varconst2: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm0 -; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8_varconst2: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: psraw $8, %xmm0 -; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8_varconst2: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x 
i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = sext <2 x i8> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i8> -; %op1 = zext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi8_varconst3: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx -; X86-SSE-NEXT: movd %ecx, %xmm0 -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8_varconst3: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm0 -; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8_varconst3: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: pxor %xmm1, %xmm1 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8_varconst3: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = zext <2 x i8> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i8> -; %op1 = zext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi8_varconst4: -; 
X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx -; X86-SSE-NEXT: movd %ecx, %xmm0 -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8_varconst4: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm0 -; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8_varconst4: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: pxor %xmm1, %xmm1 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u> -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8_varconst4: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = zext <2 x i8> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i8> -; %op1 = sext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi8_varconst5: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx -; X86-SSE-NEXT: movd %ecx, %xmm0 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: psraw $8, %xmm0 -; 
X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8_varconst5: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm0 -; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8_varconst5: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: psraw $8, %xmm0 -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u> -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8_varconst5: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = sext <2 x i8> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i8> -; %op1 = sext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi8_varconst6: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx -; X86-SSE-NEXT: movd %ecx, %xmm0 -; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X86-SSE-NEXT: psraw $8, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi8_varconst6: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx -; X86-AVX-NEXT: vmovd %ecx, %xmm0 -; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; 
X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi8_varconst6: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-SSE-NEXT: movd %ecx, %xmm0 -; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X64-SSE-NEXT: psraw $8, %xmm0 -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u> -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi8_varconst6: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx -; X64-AVX-NEXT: vmovd %ecx, %xmm0 -; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i8>* - %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 - %tmp8 = sext <2 x i8> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i16> -; %op1 = zext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi16_varconst1: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 -; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi16_varconst1: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi16_varconst1: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 -; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi16_varconst1: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i16>* - %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 - %tmp8 = zext <2 x i16> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i16> -; %op1 = sext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi16_varconst2: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X86-SSE-NEXT: pmullw %xmm1, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi16_varconst2: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi16_varconst2: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u> -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmulhw %xmm1, %xmm2 -; X64-SSE-NEXT: pmullw %xmm1, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi16_varconst2: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i16>* - %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 - %tmp8 = sext <2 x i16> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i16> -; %op1 = zext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi16_varconst3: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; 
X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi16_varconst3: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi16_varconst3: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pxor %xmm1, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u> -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi16_varconst3: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i16>* - %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 - %tmp8 = zext <2 x i16> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; %val = load <2 x i16> -; %op1 = sext<2 x i32> %val -; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768) -; %rst = mul <2 x i32> %op1, %op2 -; -define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { -; X86-SSE-LABEL: mul_2xi16_varconst4: -; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl c, %edx -; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; 
X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) -; X86-SSE-NEXT: retl -; -; X86-AVX-LABEL: mul_2xi16_varconst4: -; X86-AVX: # %bb.0: # %entry -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX-NEXT: movl c, %edx -; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 -; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) -; X86-AVX-NEXT: retl -; -; X64-SSE-LABEL: mul_2xi16_varconst4: -; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movq {{.*}}(%rip), %rax -; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u> -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) -; X64-SSE-NEXT: retq -; -; X64-AVX-LABEL: mul_2xi16_varconst4: -; X64-AVX: # %bb.0: # %entry -; X64-AVX-NEXT: movq {{.*}}(%rip), %rax -; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) -; X64-AVX-NEXT: retq -entry: - %pre = load i32*, i32** @c - %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index - %tmp7 = bitcast i8* %tmp6 to <2 x i16>* - %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 - %tmp8 = sext <2 x i16> %wide.load to <2 x i32> - %tmp13 = mul nuw nsw <2 x i32> %tmp8, - %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index - %tmp15 = bitcast i32* %tmp14 to <2 x i32>* - store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 - ret void -} - -; -; Illegal Types -; - -define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind { -; X86-SSE-LABEL: PR34947: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: pushl %esi -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: movdqa (%eax), %xmm5 -; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movdqa (%ecx), %xmm2 -; X86-SSE-NEXT: movdqa 16(%ecx), %xmm6 -; X86-SSE-NEXT: pxor %xmm0, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X86-SSE-NEXT: movdqa %xmm5, %xmm4 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] -; X86-SSE-NEXT: movd %xmm0, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] -; X86-SSE-NEXT: movd %xmm0, %esi -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %esi -; X86-SSE-NEXT: movd %edx, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1] -; X86-SSE-NEXT: movd %xmm3, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1] -; X86-SSE-NEXT: movd %xmm3, %esi -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %esi -; X86-SSE-NEXT: movd %edx, %xmm7 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; X86-SSE-NEXT: movd %xmm5, %eax -; X86-SSE-NEXT: movd %xmm6, %esi -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %esi -; X86-SSE-NEXT: movd %edx, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = 
xmm5[1,1,2,3] -; X86-SSE-NEXT: movd %xmm5, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] -; X86-SSE-NEXT: movd %xmm5, %esi -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %esi -; X86-SSE-NEXT: movd %edx, %xmm5 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3] -; X86-SSE-NEXT: movd %xmm6, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3] -; X86-SSE-NEXT: movd %xmm6, %esi -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %esi -; X86-SSE-NEXT: movd %edx, %xmm6 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] -; X86-SSE-NEXT: movd %xmm7, %eax -; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1] -; X86-SSE-NEXT: movd %xmm7, %esi -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %esi -; X86-SSE-NEXT: movd %edx, %xmm7 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; X86-SSE-NEXT: movd %xmm4, %eax -; X86-SSE-NEXT: movd %xmm2, %esi -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %esi -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3] -; X86-SSE-NEXT: movd %xmm4, %eax -; X86-SSE-NEXT: movd %edx, %xmm4 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; X86-SSE-NEXT: movd %xmm2, %esi -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl %esi -; X86-SSE-NEXT: movd %edx, %xmm2 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] -; X86-SSE-NEXT: movd %xmm1, %eax -; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm4 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; X86-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm5 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-SSE-NEXT: xorl %edx, %edx -; X86-SSE-NEXT: divl 32(%ecx) -; X86-SSE-NEXT: movdqa %xmm0, (%eax) -; X86-SSE-NEXT: movdqa %xmm4, (%eax) -; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 -; X86-SSE-NEXT: movl %eax, (%eax) -; X86-SSE-NEXT: popl %esi -; X86-SSE-NEXT: retl -; -; X86-AVX1-LABEL: PR34947: -; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: pushl %ebp -; X86-AVX1-NEXT: pushl %ebx -; X86-AVX1-NEXT: pushl %edi -; X86-AVX1-NEXT: pushl %esi -; X86-AVX1-NEXT: subl $16, %esp -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X86-AVX1-NEXT: vmovd %xmm1, %eax -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl 32(%ecx) -; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax -; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1 -; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3 -; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl 
%edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax -; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax -; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill -; X86-AVX1-NEXT: vmovd %xmm2, %eax -; X86-AVX1-NEXT: vmovd %xmm3, %ecx -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, %ebp -; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax -; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: movl %edx, %ebx -; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax -; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl %esi -; X86-AVX1-NEXT: movl %edx, %esi -; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax -; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl %edi -; X86-AVX1-NEXT: movl %edx, %edi -; X86-AVX1-NEXT: vmovd %xmm0, %eax -; X86-AVX1-NEXT: vmovd %xmm1, %ecx -; X86-AVX1-NEXT: xorl %edx, %edx -; X86-AVX1-NEXT: divl %ecx -; X86-AVX1-NEXT: vmovd %edx, %xmm0 -; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0 -; X86-AVX1-NEXT: vmovd %ebp, %xmm1 -; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload -; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload -; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-AVX1-NEXT: # imm = 0x2007 -; X86-AVX1-NEXT: movl %eax, (%eax) -; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199] -; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 -; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax) -; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax) -; X86-AVX1-NEXT: addl $16, %esp -; X86-AVX1-NEXT: popl %esi -; X86-AVX1-NEXT: popl %edi -; X86-AVX1-NEXT: popl %ebx -; X86-AVX1-NEXT: popl %ebp -; X86-AVX1-NEXT: retl -; -; X86-AVX2-LABEL: PR34947: -; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: pushl %edi -; X86-AVX2-NEXT: pushl %esi -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2 -; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3 -; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx -; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl %ecx -; X86-AVX2-NEXT: movl %edx, %ecx -; X86-AVX2-NEXT: vmovd %xmm3, %edi -; X86-AVX2-NEXT: vmovd %xmm4, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl %edi -; X86-AVX2-NEXT: vmovd %edx, %xmm5 -; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 -; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx -; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl %ecx -; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 -; 
X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx -; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl %ecx -; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 -; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx -; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl %ecx -; X86-AVX2-NEXT: movl %edx, %ecx -; X86-AVX2-NEXT: vmovd %xmm2, %edi -; X86-AVX2-NEXT: vmovd %xmm1, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl %edi -; X86-AVX2-NEXT: vmovd %edx, %xmm4 -; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 -; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx -; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl %ecx -; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx -; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl %ecx -; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 -; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; X86-AVX2-NEXT: vmovd %xmm0, %eax -; X86-AVX2-NEXT: xorl %edx, %edx -; X86-AVX2-NEXT: divl 32(%esi) -; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] -; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 -; X86-AVX2-NEXT: movl %eax, (%eax) -; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) -; X86-AVX2-NEXT: popl %esi -; X86-AVX2-NEXT: popl %edi -; X86-AVX2-NEXT: vzeroupper -; X86-AVX2-NEXT: retl -; -; X64-SSE-LABEL: PR34947: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa (%rdi), %xmm5 -; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movdqa (%rsi), %xmm2 -; X64-SSE-NEXT: movdqa 16(%rsi), %xmm6 -; X64-SSE-NEXT: pxor %xmm0, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X64-SSE-NEXT: movdqa %xmm5, %xmm3 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] -; X64-SSE-NEXT: movd %xmm0, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3] -; X64-SSE-NEXT: movd %xmm0, %ecx -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx -; X64-SSE-NEXT: movd %edx, %xmm8 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1] -; X64-SSE-NEXT: movd %xmm4, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,0,1] -; X64-SSE-NEXT: movd %xmm4, %ecx -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx -; X64-SSE-NEXT: movd %edx, %xmm7 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; X64-SSE-NEXT: movd %xmm5, %eax -; X64-SSE-NEXT: movd %xmm6, %ecx -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx -; X64-SSE-NEXT: movd %edx, %xmm4 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] -; X64-SSE-NEXT: movd %xmm5, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3] -; X64-SSE-NEXT: movd %xmm5, %ecx -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx -; X64-SSE-NEXT: movd %edx, %xmm5 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3] -; X64-SSE-NEXT: movd %xmm6, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3] -; X64-SSE-NEXT: movd %xmm6, %ecx -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx -; 
X64-SSE-NEXT: movd %edx, %xmm6 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1] -; X64-SSE-NEXT: movd %xmm7, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1] -; X64-SSE-NEXT: movd %xmm7, %ecx -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx -; X64-SSE-NEXT: movd %edx, %xmm7 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; X64-SSE-NEXT: movd %xmm3, %eax -; X64-SSE-NEXT: movd %xmm2, %ecx -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx -; X64-SSE-NEXT: movd %edx, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; X64-SSE-NEXT: movd %xmm3, %eax -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; X64-SSE-NEXT: movd %xmm2, %ecx -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl %ecx -; X64-SSE-NEXT: movd %edx, %xmm2 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0] -; X64-SSE-NEXT: movd %xmm1, %eax -; X64-SSE-NEXT: xorl %edx, %edx -; X64-SSE-NEXT: divl 32(%rsi) -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X64-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm4 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] -; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm5 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3] -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007 -; X64-SSE-NEXT: movl %eax, (%rax) -; X64-SSE-NEXT: movdqa %xmm2, (%rax) -; X64-SSE-NEXT: movdqa %xmm0, (%rax) -; X64-SSE-NEXT: retq -; -; X64-AVX1-LABEL: PR34947: -; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: pushq %rbp -; X64-AVX1-NEXT: pushq %rbx -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; X64-AVX1-NEXT: vmovd %xmm1, %eax -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl 32(%rsi) -; X64-AVX1-NEXT: movl %edx, %r8d -; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax -; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3 -; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %ecx -; X64-AVX1-NEXT: movl %edx, %r9d -; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax -; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %ecx -; X64-AVX1-NEXT: movl %edx, %r10d -; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax -; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %ecx -; X64-AVX1-NEXT: movl %edx, %r11d -; X64-AVX1-NEXT: vmovd %xmm2, %eax -; X64-AVX1-NEXT: vmovd %xmm3, %ecx -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %ecx -; X64-AVX1-NEXT: movl %edx, %esi -; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax -; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %ecx -; X64-AVX1-NEXT: movl %edx, %edi -; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax -; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx -; X64-AVX1-NEXT: xorl %edx, %edx -; 
X64-AVX1-NEXT: divl %ecx -; X64-AVX1-NEXT: movl %edx, %ecx -; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax -; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %ebx -; X64-AVX1-NEXT: movl %edx, %ebx -; X64-AVX1-NEXT: vmovd %xmm0, %eax -; X64-AVX1-NEXT: vmovd %xmm1, %ebp -; X64-AVX1-NEXT: xorl %edx, %edx -; X64-AVX1-NEXT: divl %ebp -; X64-AVX1-NEXT: vmovd %edx, %xmm0 -; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; X64-AVX1-NEXT: vmovd %esi, %xmm2 -; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 -; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007 -; X64-AVX1-NEXT: movl %eax, (%rax) -; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax) -; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax) -; X64-AVX1-NEXT: popq %rbx -; X64-AVX1-NEXT: popq %rbp -; X64-AVX1-NEXT: retq -; -; X64-AVX2-LABEL: PR34947: -; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2 -; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3 -; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx -; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4 -; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %ecx -; X64-AVX2-NEXT: movl %edx, %ecx -; X64-AVX2-NEXT: vmovd %xmm3, %edi -; X64-AVX2-NEXT: vmovd %xmm4, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %edi -; X64-AVX2-NEXT: vmovd %edx, %xmm5 -; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5 -; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx -; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %ecx -; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5 -; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx -; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %ecx -; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3 -; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx -; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %ecx -; X64-AVX2-NEXT: movl %edx, %ecx -; X64-AVX2-NEXT: vmovd %xmm2, %edi -; X64-AVX2-NEXT: vmovd %xmm1, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %edi -; X64-AVX2-NEXT: vmovd %edx, %xmm4 -; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4 -; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx -; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %ecx -; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4 -; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx -; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl %ecx -; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1 -; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 -; X64-AVX2-NEXT: vmovd %xmm0, %eax -; X64-AVX2-NEXT: xorl %edx, %edx -; X64-AVX2-NEXT: divl 32(%rsi) -; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199] -; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 -; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007 -; 
X64-AVX2-NEXT: movl %eax, (%rax) -; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) -; X64-AVX2-NEXT: vzeroupper -; X64-AVX2-NEXT: retq - %a0 = load <9 x i16>, <9 x i16>* %p0, align 64 - %a1 = load <9 x i32>, <9 x i32>* %p1, align 64 - %ext0 = zext <9 x i16> %a0 to <9 x i32> - %rem = urem <9 x i32> %ext0, %a1 - %mul = mul <9 x i32> , %rem - store <9 x i32> %mul, <9 x i32>* undef, align 64 - ret void -} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll b/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll deleted file mode 100644 index fae69ea5d80..00000000000 --- a/test/CodeGen/X86/shuffle-vs-trunc-128-widen.ll +++ /dev/null @@ -1,574 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL - -; PR31551 -; Pairs of shufflevector:trunc functions with functional equivalence. -; Ideally, the shuffles should be lowered to code with the same quality as the truncates. 
- -define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { -; SSE2-LABEL: shuffle_v16i8_to_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rsi) -; SSE2-NEXT: retq -; -; SSE42-LABEL: shuffle_v16i8_to_v8i8: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movq %xmm0, (%rsi) -; SSE42-NEXT: retq -; -; AVX-LABEL: shuffle_v16i8_to_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512-LABEL: shuffle_v16i8_to_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: retq - %vec = load <16 x i8>, <16 x i8>* %L - %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> - store <8 x i8> %strided.vec, <8 x i8>* %S - ret void -} - -define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind { -; SSE2-LABEL: trunc_v8i16_to_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rsi) -; SSE2-NEXT: retq -; -; SSE42-LABEL: trunc_v8i16_to_v8i8: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movq %xmm0, (%rsi) -; SSE42-NEXT: retq -; -; AVX-LABEL: trunc_v8i16_to_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i16_to_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i16_to_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i16_to_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq - %vec = load <16 x i8>, <16 x i8>* %L - %bc = bitcast <16 x i8> %vec to <8 x i16> - %strided.vec = trunc <8 x i16> %bc to <8 x i8> - store <8 x i8> %strided.vec, <8 x i8>* %S - ret void -} - -define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { -; SSE2-LABEL: shuffle_v8i16_to_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rsi) -; SSE2-NEXT: retq -; -; SSE42-LABEL: shuffle_v8i16_to_v4i16: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; 
SSE42-NEXT: movq %xmm0, (%rsi) -; SSE42-NEXT: retq -; -; AVX-LABEL: shuffle_v8i16_to_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512-LABEL: shuffle_v8i16_to_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512-NEXT: vmovq %xmm0, (%rsi) -; AVX512-NEXT: retq - %vec = load <8 x i16>, <8 x i16>* %L - %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> - store <4 x i16> %strided.vec, <4 x i16>* %S - ret void -} - -define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind { -; SSE2-LABEL: trunc_v4i32_to_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, (%rsi) -; SSE2-NEXT: retq -; -; SSE42-LABEL: trunc_v4i32_to_v4i16: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE42-NEXT: movq %xmm0, (%rsi) -; SSE42-NEXT: retq -; -; AVX-LABEL: trunc_v4i32_to_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i32_to_v4i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i32_to_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i32_to_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq - %vec = load <8 x i16>, <8 x i16>* %L - %bc = bitcast <8 x i16> %vec to <4 x i32> - %strided.vec = trunc <4 x i32> %bc to <4 x i16> - store <4 x i16> %strided.vec, <4 x i16>* %S - ret void -} - -define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { -; SSE-LABEL: shuffle_v4i32_to_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: retq -; -; AVX-LABEL: shuffle_v4i32_to_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vmovlps %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512-LABEL: shuffle_v4i32_to_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512-NEXT: vmovlps %xmm0, (%rsi) -; AVX512-NEXT: retq - %vec = load <4 x i32>, <4 x i32>* %L - %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> - store <2 x i32> %strided.vec, <2 x i32>* %S - ret void -} - -define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind { -; SSE-LABEL: trunc_v2i64_to_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: retq -; -; AVX-LABEL: 
trunc_v2i64_to_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX-NEXT: vmovlps %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc_v2i64_to_v2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vmovlps %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v2i64_to_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v2i64_to_v2i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512BW-NEXT: vmovlps %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq - %vec = load <4 x i32>, <4 x i32>* %L - %bc = bitcast <4 x i32> %vec to <2 x i64> - %strided.vec = trunc <2 x i64> %bc to <2 x i32> - store <2 x i32> %strided.vec, <2 x i32>* %S - ret void -} - -define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { -; SSE2-LABEL: shuffle_v16i8_to_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, (%rsi) -; SSE2-NEXT: retq -; -; SSE42-LABEL: shuffle_v16i8_to_v4i8: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movd %xmm0, (%rsi) -; SSE42-NEXT: retq -; -; AVX-LABEL: shuffle_v16i8_to_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512-LABEL: shuffle_v16i8_to_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovd %xmm0, (%rsi) -; AVX512-NEXT: retq - %vec = load <16 x i8>, <16 x i8>* %L - %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> - store <4 x i8> %strided.vec, <4 x i8>* %S - ret void -} - -define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind { -; SSE2-LABEL: trunc_v4i32_to_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, (%rsi) -; SSE2-NEXT: retq -; -; SSE42-LABEL: trunc_v4i32_to_v4i8: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE42-NEXT: movd %xmm0, (%rsi) -; SSE42-NEXT: retq -; -; AVX-LABEL: trunc_v4i32_to_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i32_to_v4i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i32_to_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i32_to_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq - %vec = load <16 x i8>, <16 x i8>* %L - %bc = bitcast <16 x i8> %vec to <4 x i32> - %strided.vec = trunc <4 x i32> %bc to <4 x i8> - store <4 x i8> %strided.vec, <4 x i8>* %S - ret void -} - -define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { -; SSE-LABEL: shuffle_v8i16_to_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movd %xmm0, (%rsi) -; SSE-NEXT: retq -; -; AVX1-LABEL: shuffle_v8i16_to_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: shuffle_v8i16_to_v2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v8i16_to_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vmovd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v8i16_to_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq - %vec = load <8 x i16>, <8 x i16>* %L - %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> - store <2 x i16> %strided.vec, <2 x i16>* %S - ret void -} - -define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind { -; SSE-LABEL: trunc_v2i64_to_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: movd %xmm0, (%rsi) -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_v2i64_to_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vmovd %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16: -; AVX2-FAST: # %bb.0: -; 
AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi) -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_v2i64_to_v2i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v2i64_to_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v2i64_to_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq - %vec = load <8 x i16>, <8 x i16>* %L - %bc = bitcast <8 x i16> %vec to <2 x i64> - %strided.vec = trunc <2 x i64> %bc to <2 x i16> - store <2 x i16> %strided.vec, <2 x i16>* %S - ret void -} - -define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { -; SSE2-LABEL: shuffle_v16i8_to_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rsi) -; SSE2-NEXT: retq -; -; SSE42-LABEL: shuffle_v16i8_to_v2i8: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) -; SSE42-NEXT: retq -; -; AVX-LABEL: shuffle_v16i8_to_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512-LABEL: shuffle_v16i8_to_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512-NEXT: retq - %vec = load <16 x i8>, <16 x i8>* %L - %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> - store <2 x i8> %strided.vec, <2 x i8>* %S - ret void -} - -define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind { -; SSE2-LABEL: trunc_v2i64_to_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: movw %ax, (%rsi) -; SSE2-NEXT: retq -; -; SSE42-LABEL: trunc_v2i64_to_v2i8: -; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) -; SSE42-NEXT: retq -; -; AVX-LABEL: trunc_v2i64_to_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc_v2i64_to_v2i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpextrw $0, 
%xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v2i64_to_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v2i64_to_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq - %vec = load <16 x i8>, <16 x i8>* %L - %bc = bitcast <16 x i8> %vec to <2 x i64> - %strided.vec = trunc <2 x i64> %bc to <2 x i8> - store <2 x i8> %strided.vec, <2 x i8>* %S - ret void -} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll b/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll deleted file mode 100644 index ef5b8665e16..00000000000 --- a/test/CodeGen/X86/shuffle-vs-trunc-256-widen.ll +++ /dev/null @@ -1,1454 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL - -; PR31551 -; Pairs of shufflevector:trunc functions with functional equivalence. -; Ideally, the shuffles should be lowered to code with the same quality as the truncates. 
- -define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: shuffle_v32i8_to_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: retq - %vec = load <32 x i8>, <32 x i8>* %L - %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> - store <16 x i8> %strided.vec, <16 x i8>* %S - ret void -} - -define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX1-LABEL: trunc_v16i16_to_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v16i16_to_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_v16i16_to_v16i8: -; AVX512F: # %bb.0: -; 
AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v16i16_to_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v16i16_to_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <32 x i8>, <32 x i8>* %L - %bc = bitcast <32 x i8> %vec to <16 x i16> - %strided.vec = trunc <16 x i16> %bc to <16 x i8> - store <16 x i8> %strided.vec, <16 x i8>* %S - ret void -} - -define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { -; AVX-LABEL: shuffle_v16i16_to_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vmovdqa %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: shuffle_v16i16_to_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_to_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i16_to_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; 
AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] -; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14] -; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: retq - %vec = load <16 x i16>, <16 x i16>* %L - %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> - store <8 x i16> %strided.vec, <8 x i16>* %S - ret void -} - -define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind { -; AVX1-LABEL: trunc_v8i32_to_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_to_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i32_to_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i32_to_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i32_to_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <16 x i16>, <16 x i16>* %L - %bc = bitcast <16 x i16> %vec to <8 x i32> - %strided.vec = trunc <8 x i32> %bc to <8 x i16> - store <8 x i16> %strided.vec, <8 x i16>* %S - ret void -} - -define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind { -; AVX-LABEL: shuffle_v8i32_to_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX-NEXT: vmovaps %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512-LABEL: shuffle_v8i32_to_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovaps (%rdi), %xmm0 -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX512-NEXT: vmovaps %xmm0, (%rsi) -; AVX512-NEXT: retq - %vec = load <8 x i32>, <8 x i32>* %L - %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> - store <4 x i32> %strided.vec, <4 x i32>* %S - ret void -} - -define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind 
{ -; AVX1-LABEL: trunc_v4i64_to_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX1-NEXT: vmovaps %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2] -; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i64_to_v4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i64_to_v4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i64_to_v4i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <8 x i32>, <8 x i32>* %L - %bc = bitcast <8 x i32> %vec to <4 x i64> - %strided.vec = trunc <4 x i64> %bc to <4 x i32> - store <4 x i32> %strided.vec, <4 x i32>* %S - ret void -} - -define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: shuffle_v32i8_to_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: 
vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112] -; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 -; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: retq - %vec = load <32 x i8>, <32 x i8>* %L - %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> - store <8 x i8> %strided.vec, <8 x i8>* %S - ret void -} - -define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX-LABEL: trunc_v8i32_to_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX-NEXT: vmovq %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i32_to_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i32_to_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i32_to_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <32 x i8>, <32 x i8>* %L - %bc = bitcast <32 x i8> %vec to <8 x i32> - %strided.vec = trunc <8 x i32> %bc to <8 x i8> - store <8 x i8> %strided.vec, <8 x i8>* %S - ret void -} - -define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind { -; IR generated from: -; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0}; -; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; 
AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated.vec = trunc <8 x i32> %vec to <8 x i8> - %bc = bitcast <8 x i8> %truncated.vec to i64 - %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0 - ret <2 x i64> %result -} - -define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind { -; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: -; 
AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated = trunc <8 x i32> %vec to <8 x i8> - %truncated.ext = zext <8 x i8> %truncated to <8 x i16> - %bc = bitcast <8 x i16> %truncated.ext to <16 x i8> - %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> - ret <16 x i8> %result -} - -define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind { -; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated = trunc <8 x i32> %vec to <8 x i16> - %bc = bitcast <8 x i16> %truncated to <16 x i8> - %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> - ret <16 x i8> %result -} - -define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind { -; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated = trunc <8 x i32> %vec to <8 x i8> - %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> - ret <16 x i8> %result -} - -define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind { -; IR generated from: -; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0}; -; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vextracti128 
$1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated = trunc <4 x i64> %vec to <4 x i16> - %bc = bitcast <4 x i16> %truncated to i64 - %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0 - ret <2 x i64> %result -} - -define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: 
trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated = trunc <4 x i64> %vec to <4 x i16> - %truncated.ext = zext <4 x i16> %truncated to <4 x i32> - %bc = bitcast <4 x i32> %truncated.ext to <8 x i16> - %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> - ret <8 x i16> %result -} - -define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated = trunc <4 x i64> %vec to <4 x i32> - %bc = bitcast <4 x i32> %truncated to <8 x i16> - %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> - 
ret <8 x i16> %result -} - -define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated = trunc <4 x i64> %vec to <4 x i16> - %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> - ret <8 x i16> %result -} - -define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; 
AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0 -; AVX512VBMIVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VBMIVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %truncated = trunc <4 x i64> %vec to <4 x i8> - %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> - ret <16 x i8> %result -} - -define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { -; AVX1-LABEL: shuffle_v16i16_to_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: shuffle_v16i16_to_v4i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i16_to_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i16_to_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13] -; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi) -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13] -; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1 -; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: retq - %vec = load <16 x i16>, <16 x i16>* %L - %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> - store <4 x i16> %strided.vec, <4 x i16>* %S - ret void -} - -define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind { -; AVX1-LABEL: trunc_v4i64_to_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi) -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi) -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i64_to_v4i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i64_to_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i64_to_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <16 x i16>, <16 x i16>* %L - %bc = bitcast <16 x i16> %vec to <4 x i64> - %strided.vec = trunc <4 x i64> %bc to <4 x i16> - store <4 x i16> %strided.vec, <4 x i16>* %S - ret void -} - -define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX-LABEL: shuffle_v32i8_to_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: shuffle_v32i8_to_v4i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i8_to_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vmovd %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i8_to_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; 
AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808] -; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1 -; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: retq - %vec = load <32 x i8>, <32 x i8>* %L - %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> - store <4 x i8> %strided.vec, <4 x i8>* %S - ret void -} - -define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind { -; AVX-LABEL: trunc_v4i64_to_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vmovd %xmm0, (%rsi) -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc_v4i64_to_v4i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512F-NEXT: vmovd %xmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v4i64_to_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v4i64_to_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <32 x i8>, <32 x i8>* %L - %bc = bitcast <32 x i8> %vec to <4 x i64> - %strided.vec = trunc <4 x i64> %bc to <4 x i8> - store <4 x i8> %strided.vec, <4 x i8>* %S - ret void -} - -; In this case not all elements are collected from the same source vector, so -; the resulting BUILD_VECTOR should not be combined to a truncate. 
-define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind { -; AVX1-LABEL: negative: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14] -; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: negative: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: negative: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: negative: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: negative: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: negative: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] -; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001 -; AVX512BWVL-NEXT: kmovd %eax, %k1 -; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} -; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3] -; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMIVL-LABEL: negative: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = 
[32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30] -; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 -; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> - %w0 = extractelement <32 x i8> %w, i32 0 - %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0 - ret <16 x i8> %merged -} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll b/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll deleted file mode 100644 index f9eae50039c..00000000000 --- a/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll +++ /dev/null @@ -1,903 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL - -; PR31551 -; Pairs of shufflevector:trunc functions with functional equivalence. -; Ideally, the shuffles should be lowered to code with the same quality as the truncates. 
- -define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v32i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] -; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7] -; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] -; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] -; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; 
AVX512VBMIVL-NEXT: retq - %vec = load <64 x i8>, <64 x i8>* %L - %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> - store <32 x i8> %strided.vec, <32 x i8>* %S - ret void -} - -define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind { -; AVX512F-LABEL: trunc_v32i16_to_v32i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi) -; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_v32i16_to_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi) -; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_v32i16_to_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <64 x i8>, <64 x i8>* %L - %bc = bitcast <64 x i8> %vec to <32 x i16> - %strided.vec = trunc <32 x i16> %bc to <32 x i8> - store <32 x i8> %strided.vec, <32 x i8>* %S - ret void -} - -define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { -; AVX512F-LABEL: shuffle_v32i16_to_v16i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15] -; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] -; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vmovaps %ymm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i16_to_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] -; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14] -; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 -; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i16_to_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] -; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BW-NEXT: vmovaps %ymm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31] -; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] -; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] -; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <32 x i16>, <32 x i16>* %L - %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> - store <16 x i16> %strided.vec, <16 x i16>* %S - ret void -} - -define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind { -; AVX512-LABEL: trunc_v16i32_to_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovdw %zmm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %vec = load <32 x i16>, <32 x i16>* %L - %bc = bitcast <32 x i16> %vec to <16 x i32> - %strided.vec = trunc <16 x i32> %bc to <16 x i16> - store <16 x i16> %strided.vec, <16 x i16>* %S - ret void -} - -define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { -; AVX512F-LABEL: shuffle_v16i32_to_v8i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] -; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512F-NEXT: vmovaps %ymm0, (%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v16i32_to_v8i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; 
AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] -; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v16i32_to_v8i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovaps (%rdi), %ymm0 -; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] -; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512BW-NEXT: vmovaps %ymm0, (%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] -; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0 -; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6] -; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi) -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14] -; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <16 x i32>, <16 x i32>* %L - %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> - store <8 x i32> %strided.vec, <8 x i32>* %S - ret void -} - -define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind { -; AVX512-LABEL: trunc_v8i64_to_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %vec = load <16 x i32>, <16 x i32>* %L - %bc = bitcast <16 x i32> %vec to <8 x i64> - %strided.vec = trunc <8 x i64> %bc to <8 x i32> - store <8 x i32> %strided.vec, <8 x i32>* %S - ret void -} - -define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512VL-NEXT: vmovdqa 
{{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60] -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <64 x i8>, <64 x i8>* %L - %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> - store <16 x i8> %strided.vec, <16 x i8>* %S - ret void -} - -define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind { -; AVX512-LABEL: trunc_v16i32_to_v16i8: -; AVX512: # %bb.0: -; 
AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %vec = load <64 x i8>, <64 x i8>* %L - %bc = bitcast <64 x i8> %vec to <16 x i32> - %strided.vec = trunc <16 x i32> %bc to <16 x i8> - store <16 x i8> %strided.vec, <16 x i8>* %S - ret void -} - -define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { -; AVX512F-LABEL: shuffle_v32i16_to_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3] -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512F-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v32i16_to_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v32i16_to_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] -; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VBMI-NEXT: vmovdqa 
{{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi) -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28] -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1 -; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <32 x i16>, <32 x i16>* %L - %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> - store <8 x i16> %strided.vec, <8 x i16>* %S - ret void -} - -define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind { -; AVX512-LABEL: trunc_v8i64_to_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %vec = load <32 x i16>, <32 x i16>* %L - %bc = bitcast <32 x i16> %vec to <8 x i64> - %strided.vec = trunc <8 x i64> %bc to <8 x i16> - store <8 x i16> %strided.vec, <8 x i16>* %S - ret void -} - -define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512F-LABEL: shuffle_v64i8_to_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm0, (%rsi) -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm0, (%rsi) -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 -; 
AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BW-NEXT: vmovq %xmm0, (%rsi) -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi) -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224] -; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi) -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %vec = load <64 x i8>, <64 x i8>* %L - %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> - store <8 x i8> %strided.vec, <8 x i8>* %S - ret void -} - -define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { -; AVX512-LABEL: trunc_v8i64_to_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, (%rsi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %vec = load <64 x i8>, <64 x i8>* %L - %bc = bitcast <64 x i8> %vec to <8 x i64> - %strided.vec = trunc <8 
x i64> %bc to <8 x i8> - store <8 x i8> %strided.vec, <8 x i8>* %S - ret void -} - -define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) { -; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512VBMI: # %bb.0: -; 
AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] -; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61] -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 -; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> - ret <16 x i8> %res -} - -define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) { -; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} 
xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u] -; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62] -; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512VBMI-NEXT: vzeroupper -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62] -; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 -; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512VBMIVL-NEXT: vzeroupper -; AVX512VBMIVL-NEXT: retq - %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> - ret <16 x i8> %res -} - -define <4 x double> @PR34175(<32 x i16>* %p) { -; AVX512F-LABEL: PR34175: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: PR34175: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3] -; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: PR34175: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: PR34175: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqu 
(%rdi), %ymm0 -; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] -; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512BWVL-NEXT: retq -; -; AVX512VBMI-LABEL: PR34175: -; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1 -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1 -; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512VBMI-NEXT: retq -; -; AVX512VBMIVL-LABEL: PR34175: -; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768] -; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1 -; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX512VBMIVL-NEXT: retq - %v = load <32 x i16>, <32 x i16>* %p, align 2 - %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> - %tofp = uitofp <4 x i16> %shuf to <4 x double> - ret <4 x double> %tofp -} - -define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind { -; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %truncated = trunc <8 x i64> %vec to <8 x i8> - %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> - ret <16 x i8> %result -} - diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll index dad00e8c48d..e43216dcdbd 100644 --- a/test/CodeGen/X86/vec_cast2.ll +++ b/test/CodeGen/X86/vec_cast2.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx | FileCheck %s -; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) { ; CHECK-LABEL: cvt_v8i8_v8f32: @@ -11,15 +10,6 @@ define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) { ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v8i8_v8f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm1 -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-WIDE-NEXT: retl %res = sitofp <8 x i8> %src to <8 x float> ret <8 x float> %res } @@ -33,15 +23,6 @@ define <8 x float> @cvt_v8i16_v8f32(<8 x i16> %src) { ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v8i16_v8f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm1 -; CHECK-WIDE-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-WIDE-NEXT: retl %res = sitofp <8 x i16> %src to <8 x float> ret <8 x float> %res } @@ -52,12 +33,6 @@ define <4 x float> @cvt_v4i8_v4f32(<4 x i8> %src) { ; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v4i8_v4f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = sitofp <4 x i8> %src to <4 x float> ret <4 x float> %res } @@ -68,12 +43,6 @@ define <4 x float> @cvt_v4i16_v4f32(<4 x i16> %src) { ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v4i16_v4f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = sitofp <4 x i16> %src to <4 x float> ret <4 x float> %res } @@ -87,15 +56,6 @@ define <8 x float> @cvt_v8u8_v8f32(<8 x i8> %src) { ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v8u8_v8f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-WIDE-NEXT: retl %res = uitofp <8 x i8> %src to <8 x float> ret <8 x float> %res } @@ -109,15 +69,6 @@ define <8 x float> @cvt_v8u16_v8f32(<8 x i16> %src) { ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v8u16_v8f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0 -; CHECK-WIDE-NEXT: retl %res = uitofp <8 x i16> %src to <8 x float> ret <8 x float> %res } @@ -128,12 +79,6 @@ define <4 x float> @cvt_v4u8_v4f32(<4 x i8> %src) { ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v4u8_v4f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = uitofp <4 x i8> %src to <4 x float> ret <4 x float> %res } @@ -144,12 +89,6 @@ define <4 x float> @cvt_v4u16_v4f32(<4 x i16> %src) { ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v4u16_v4f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-WIDE-NEXT: 
vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = uitofp <4 x i16> %src to <4 x float> ret <4 x float> %res } @@ -163,15 +102,6 @@ define <8 x i8> @cvt_v8f32_v8i8(<8 x float> %src) { ; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v8f32_v8i8: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper -; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i8> ret <8 x i8> %res } @@ -184,14 +114,6 @@ define <8 x i16> @cvt_v8f32_v8i16(<8 x float> %src) { ; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v8f32_v8i16: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper -; CHECK-WIDE-NEXT: retl %res = fptosi <8 x float> %src to <8 x i16> ret <8 x i16> %res } @@ -202,12 +124,6 @@ define <4 x i8> @cvt_v4f32_v4i8(<4 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v4f32_v4i8: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i8> ret <4 x i8> %res } @@ -218,12 +134,6 @@ define <4 x i16> @cvt_v4f32_v4i16(<4 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v4f32_v4i16: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = fptosi <4 x float> %src to <4 x i16> ret <4 x i16> %res } @@ -237,15 +147,6 @@ define <8 x i8> @cvt_v8f32_v8u8(<8 x float> %src) { ; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v8f32_v8u8: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper -; CHECK-WIDE-NEXT: retl %res = fptoui <8 x float> %src to <8 x i8> ret <8 x i8> %res } @@ -258,14 +159,6 @@ define <8 x i16> @cvt_v8f32_v8u16(<8 x float> %src) { ; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v8f32_v8u16: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0 -; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1 -; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vzeroupper -; CHECK-WIDE-NEXT: retl %res = fptoui <8 x float> %src to <8 x i16> ret <8 x i16> %res } @@ -276,12 +169,6 @@ define <4 x i8> @cvt_v4f32_v4u8(<4 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v4f32_v4u8: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] -; CHECK-WIDE-NEXT: retl %res = fptoui <4 x float> %src to <4 
x i8> ret <4 x i8> %res } @@ -292,12 +179,6 @@ define <4 x i16> @cvt_v4f32_v4u16(<4 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v4f32_v4u16: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = fptoui <4 x float> %src to <4 x i16> ret <4 x i16> %res } diff --git a/test/CodeGen/X86/vec_cast3.ll b/test/CodeGen/X86/vec_cast3.ll index 4148f7eb0f4..82b8c00c0a2 100644 --- a/test/CodeGen/X86/vec_cast3.ll +++ b/test/CodeGen/X86/vec_cast3.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx | FileCheck %s -; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) { ; CHECK-LABEL: cvt_v2i8_v2f32: @@ -8,12 +7,6 @@ define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) { ; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2i8_v2f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = sitofp <2 x i8> %src to <2 x float> ret <2 x float> %res } @@ -24,12 +17,6 @@ define <2 x float> @cvt_v2i16_v2f32(<2 x i16> %src) { ; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2i16_v2f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = sitofp <2 x i16> %src to <2 x float> ret <2 x float> %res } @@ -39,11 +26,6 @@ define <2 x float> @cvt_v2i32_v2f32(<2 x i32> %src) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2i32_v2f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = sitofp <2 x i32> %src to <2 x float> ret <2 x float> %res } @@ -54,12 +36,6 @@ define <2 x float> @cvt_v2u8_v2f32(<2 x i8> %src) { ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2u8_v2f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = uitofp <2 x i8> %src to <2 x float> ret <2 x float> %res } @@ -70,12 +46,6 @@ define <2 x float> @cvt_v2u16_v2f32(<2 x i16> %src) { ; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2u16_v2f32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = uitofp <2 x i16> %src to <2 x float> ret <2 x float> %res } @@ -89,15 +59,6 @@ define <2 x float> @cvt_v2u32_v2f32(<2 x i32> %src) { ; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2u32_v2f32: -; CHECK-WIDE: ## %bb.0: -; 
CHECK-WIDE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15] -; CHECK-WIDE-NEXT: vpor %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vsubpd %xmm1, %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vcvtpd2ps %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = uitofp <2 x i32> %src to <2 x float> ret <2 x float> %res } @@ -108,12 +69,6 @@ define <2 x i8> @cvt_v2f32_v2i8(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2f32_v2i8: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i8> ret <2 x i8> %res } @@ -124,12 +79,6 @@ define <2 x i16> @cvt_v2f32_v2i16(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2f32_v2i16: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i16> ret <2 x i16> %res } @@ -139,11 +88,6 @@ define <2 x i32> @cvt_v2f32_v2i32(<2 x float> %src) { ; CHECK: ## %bb.0: ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2f32_v2i32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: retl %res = fptosi <2 x float> %src to <2 x i32> ret <2 x i32> %res } @@ -154,12 +98,6 @@ define <2 x i8> @cvt_v2f32_v2u8(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2f32_v2u8: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i8> ret <2 x i8> %res } @@ -170,12 +108,6 @@ define <2 x i16> @cvt_v2f32_v2u16(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2f32_v2u16: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i16> ret <2 x i16> %res } @@ -191,17 +123,6 @@ define <2 x i32> @cvt_v2f32_v2u32(<2 x float> %src) { ; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 ; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: cvt_v2f32_v2u32: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; CHECK-WIDE-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; CHECK-WIDE-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; CHECK-WIDE-NEXT: vcvttps2dq %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vxorps LCPI11_1, %xmm1, %xmm1 -; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0 -; CHECK-WIDE-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; CHECK-WIDE-NEXT: retl %res = fptoui <2 x float> %src to <2 x i32> ret <2 x i32> %res } @@ -214,14 +135,6 @@ define <32 x i8> @PR40146(<4 x i64> %x) { ; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; CHECK-NEXT: retl -; -; CHECK-WIDE-LABEL: PR40146: -; CHECK-WIDE: ## %bb.0: -; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; CHECK-WIDE-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; CHECK-WIDE-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; CHECK-WIDE-NEXT: retl %perm = shufflevector <4 x i64> %x, <4 x i64> undef, <4 x i32> %t1 = bitcast <4 x i64> %perm to <32 x i8> %t2 = shufflevector <32 x i8> %t1, <32 x i8> , <32 x i32> diff --git a/test/CodeGen/X86/vec_fp_to_int-widen.ll b/test/CodeGen/X86/vec_fp_to_int-widen.ll deleted file mode 100644 index 9541d3834a4..00000000000 --- a/test/CodeGen/X86/vec_fp_to_int-widen.ll +++ /dev/null @@ -1,2794 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ -; -; 32-bit tests to make sure we're not doing anything stupid. 
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 - -; -; Double to Signed Integer -; - -define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) { -; SSE-LABEL: fptosi_2f64_to_2i64: -; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: fptosi_2f64_to_2i64: -; VEX: # %bb.0: -; VEX-NEXT: vcvttsd2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vcvttsd2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f64_to_2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f64_to_2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f64_to_2i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptosi <2 x double> %a to <2 x i64> - ret <2 x i64> %cvt -} - -define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) { -; SSE-LABEL: fptosi_2f64_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f64_to_4i32: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = fptosi <2 x double> %a to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> - ret <4 x i32> %ext -} - -define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) { -; SSE-LABEL: fptosi_2f64_to_2i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f64_to_2i32: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = fptosi <2 x double> %a to <2 x i32> - ret <2 x i32> %cvt -} - -define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) { -; SSE-LABEL: fptosi_4f64_to_2i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_4f64_to_2i32: -; AVX: # %bb.0: -; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq - %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> - %cvt = fptosi <4 x double> %ext to <4 x i32> - ret <4 x i32> %cvt -} - -define <4 x i64> @fptosi_4f64_to_4i64(<4 x 
double> %a) { -; SSE-LABEL: fptosi_4f64_to_4i64: -; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptosi_4f64_to_4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vcvttsd2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX1-NEXT: vcvttsd2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vcvttsd2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vcvttsd2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptosi_4f64_to_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vcvttsd2si %xmm1, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX2-NEXT: vcvttsd2si %xmm1, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vcvttsd2si %xmm0, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vcvttsd2si %xmm0, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptosi_4f64_to_4i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vcvttsd2si %xmm1, %rax -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512F-NEXT: vcvttsd2si %xmm1, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvttsd2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_4f64_to_4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_4f64_to_4i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0 -; 
AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptosi <4 x double> %a to <4 x i64> - ret <4 x i64> %cvt -} - -define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) { -; SSE-LABEL: fptosi_4f64_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_4f64_to_4i32: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq - %cvt = fptosi <4 x double> %a to <4 x i32> - ret <4 x i32> %cvt -} - -; -; Double to Unsigned Integer -; - -define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) { -; SSE-LABEL: fptoui_2f64_to_2i64: -; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movapd %xmm0, %xmm1 -; SSE-NEXT: subsd %xmm2, %xmm1 -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rdx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movapd %xmm0, %xmm3 -; SSE-NEXT: subsd %xmm2, %xmm3 -; SSE-NEXT: cvttsd2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: ucomisd %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: fptoui_2f64_to_2i64: -; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttsd2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rdx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttsd2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttsd2si %xmm0, %rcx -; VEX-NEXT: vucomisd %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f64_to_2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f64_to_2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f64_to_2i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64: -; AVX512VLDQ: # %bb.0: -; 
AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <2 x double> %a to <2 x i64> - ret <2 x i64> %cvt -} - -define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) { -; SSE-LABEL: fptoui_2f64_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movd %ecx, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_2f64_to_4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_2f64_to_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vpackssdw %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f64_to_4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f64_to_4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f64_to_4i32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <2 x double> %a to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> - ret <4 x i32> %ext -} - -define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) { -; SSE-LABEL: fptoui_2f64_to_2i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_2f64_to_2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpackssdw 
%xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_2f64_to_2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f64_to_2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f64_to_2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f64_to_2i32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <2 x double> %a to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> - ret <4 x i32> %ext -} - -define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) { -; SSE-LABEL: fptoui_4f64_to_2i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_4f64_to_2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd %xmm0, %xmm0 -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_4f64_to_2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovapd %xmm0, %xmm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, 
%xmm1, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_4f64_to_2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps %xmm0, %xmm0 -; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_4f64_to_2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps %xmm0, %xmm0 -; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_4f64_to_2i32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0 -; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vmovaps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %ext = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> - %cvt = fptoui <4 x double> %ext to <4 x i32> - ret <4 x i32> %cvt -} - -define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) { -; SSE-LABEL: fptoui_4f64_to_4i64: -; SSE: # %bb.0: -; SSE-NEXT: movapd %xmm0, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero -; SSE-NEXT: subsd %xmm3, %xmm0 -; SSE-NEXT: cvttsd2si %xmm0, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm2, %rdx -; SSE-NEXT: ucomisd %xmm3, %xmm2 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: movapd %xmm2, %xmm4 -; SSE-NEXT: subsd %xmm3, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm2, %rdx -; SSE-NEXT: ucomisd %xmm3, %xmm2 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: movapd %xmm1, %xmm2 -; SSE-NEXT: subsd %xmm3, %xmm2 -; SSE-NEXT: cvttsd2si %xmm2, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rdx -; SSE-NEXT: ucomisd %xmm3, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: movapd %xmm1, %xmm4 -; SSE-NEXT: subsd %xmm3, %xmm4 -; SSE-NEXT: cvttsd2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: ucomisd %xmm3, %xmm1 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_4f64_to_4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vcvttsd2si %xmm3, %rax -; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttsd2si %xmm2, %rdx -; AVX1-NEXT: vucomisd %xmm1, %xmm2 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4 -; AVX1-NEXT: vcvttsd2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttsd2si %xmm2, %rdx -; AVX1-NEXT: vucomisd %xmm1, %xmm2 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: 
vcvttsd2si %xmm3, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttsd2si %xmm0, %rdx -; AVX1-NEXT: vucomisd %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vcvttsd2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttsd2si %xmm0, %rcx -; AVX1-NEXT: vucomisd %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_4f64_to_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vcvttsd2si %xmm3, %rax -; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttsd2si %xmm2, %rdx -; AVX2-NEXT: vucomisd %xmm1, %xmm2 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4 -; AVX2-NEXT: vcvttsd2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttsd2si %xmm2, %rdx -; AVX2-NEXT: vucomisd %xmm1, %xmm2 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vcvttsd2si %xmm3, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttsd2si %xmm0, %rdx -; AVX2-NEXT: vucomisd %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm4 -; AVX2-NEXT: vcvttsd2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttsd2si %xmm0, %rcx -; AVX2-NEXT: vucomisd %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_4f64_to_4i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_4f64_to_4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] -; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; 
AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_4f64_to_4i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2uqq %ymm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <4 x double> %a to <4 x i64> - ret <4 x i64> %cvt -} - -define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) { -; SSE-LABEL: fptoui_4f64_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %eax, %xmm2 -; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: cvttsd2si %xmm1, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttsd2si %xmm0, %rax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_4f64_to_4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_4f64_to_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9] -; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_4f64_to_4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_4f64_to_4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_4f64_to_4i32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <4 x double> %a to <4 x i32> - ret <4 x i32> %cvt -} - -; -; Float to Signed Integer -; - -define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) { -; SSE-LABEL: fptosi_2f32_to_2i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: 
fptosi_2f32_to_2i32: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = fptosi <2 x float> %a to <2 x i32> - ret <2 x i32> %cvt -} - -define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) { -; SSE-LABEL: fptosi_4f32_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_4f32_to_4i32: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = fptosi <4 x float> %a to <4 x i32> - ret <4 x i32> %cvt -} - -define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) { -; SSE-LABEL: fptosi_2f32_to_2i64: -; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: fptosi_2f32_to_2i64: -; VEX: # %bb.0: -; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f32_to_2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f32_to_2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f32_to_2i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> - %cvt = fptosi <2 x float> %shuf to <2 x i64> - ret <2 x i64> %cvt -} - -define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) { -; SSE-LABEL: fptosi_4f32_to_2i64: -; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: fptosi_4f32_to_2i64: -; VEX: # %bb.0: -; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm1, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_4f32_to_2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2si %xmm1, %rax -; AVX512F-NEXT: vcvttss2si %xmm0, %rcx -; AVX512F-NEXT: vmovq %rcx, %xmm0 -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: 
retq -; -; AVX512VL-LABEL: fptosi_4f32_to_2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2si %xmm1, %rax -; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx -; AVX512VL-NEXT: vmovq %rcx, %xmm0 -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_4f32_to_2i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %cvt = fptosi <4 x float> %a to <4 x i64> - %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> - ret <2 x i64> %shuf -} - -define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) { -; SSE-LABEL: fptosi_8f32_to_8i32: -; SSE: # %bb.0: -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_8f32_to_8i32: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX-NEXT: retq - %cvt = fptosi <8 x float> %a to <8 x i32> - ret <8 x i32> %cvt -} - -define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) { -; SSE-LABEL: fptosi_4f32_to_4i64: -; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptosi_4f32_to_4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvttss2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1-NEXT: vcvttss2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vcvttss2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvttss2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptosi_4f32_to_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvttss2si %xmm1, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX2-NEXT: vcvttss2si %xmm2, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vcvttss2si %xmm0, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvttss2si %xmm0, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptosi_4f32_to_4i64: -; 
AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvttss2si %xmm1, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512F-NEXT: vcvttss2si %xmm2, %rax -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_4f32_to_4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvttss2si %xmm1, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512VL-NEXT: vcvttss2si %xmm2, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_4f32_to_4i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> - %cvt = fptosi <4 x float> %shuf to <4 x i64> - ret <4 x i64> %cvt -} - -define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) { -; SSE-LABEL: fptosi_8f32_to_4i64: -; SSE: # %bb.0: -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptosi_8f32_to_4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX1-NEXT: vcvttss2si %xmm1, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX1-NEXT: vcvttss2si %xmm2, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-NEXT: vcvttss2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm2 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vcvttss2si %xmm0, %rax -; AVX1-NEXT: vmovq %rax, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptosi_8f32_to_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX2-NEXT: vcvttss2si %xmm1, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = 
xmm0[1,0] -; AVX2-NEXT: vcvttss2si %xmm2, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vcvttss2si %xmm0, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vcvttss2si %xmm0, %rax -; AVX2-NEXT: vmovq %rax, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptosi_8f32_to_4i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2si %xmm1, %rax -; AVX512F-NEXT: vcvttss2si %xmm0, %rcx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvttss2si %xmm1, %rdx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvttss2si %xmm0, %rsi -; AVX512F-NEXT: vmovq %rsi, %xmm0 -; AVX512F-NEXT: vmovq %rdx, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vmovq %rcx, %xmm1 -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_8f32_to_4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2si %xmm1, %rax -; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi -; AVX512VL-NEXT: vmovq %rsi, %xmm0 -; AVX512VL-NEXT: vmovq %rdx, %xmm1 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vmovq %rcx, %xmm1 -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_8f32_to_4i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_8f32_to_4i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2qq %ymm0, %zmm0 -; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptosi <8 x float> %a to <8 x i64> - %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> - ret <4 x i64> %shuf -} - -; -; Float to Unsigned Integer -; - -define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) { -; SSE-LABEL: fptoui_2f32_to_2i32: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: cmpltps %xmm2, %xmm1 -; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm2, %xmm0 -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_2f32_to_2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_2f32_to_2i32: -; AVX2: # %bb.0: -; 
AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f32_to_2i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f32_to_2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f32_to_2i32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <2 x float> %a to <2 x i32> - ret <2 x i32> %cvt -} - -define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) { -; SSE-LABEL: fptoui_4f32_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: cmpltps %xmm2, %xmm1 -; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm2, %xmm0 -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 -; SSE-NEXT: andps %xmm1, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm1 -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_4f32_to_4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_4f32_to_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_4f32_to_4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_4f32_to_4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_4f32_to_4i32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; 
AVX512VLDQ-LABEL: fptoui_4f32_to_4i32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <4 x float> %a to <4 x i32> - ret <4 x i32> %cvt -} - -define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) { -; SSE-LABEL: fptoui_2f32_to_2i64: -; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: subss %xmm2, %xmm1 -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: fptoui_2f32_to_2i64: -; VEX: # %bb.0: -; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttss2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rdx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f32_to_2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f32_to_2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f32_to_2i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> - %cvt = fptoui <2 x float> %shuf to <2 x i64> - ret <2 x i64> %cvt -} - -define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) { -; SSE-LABEL: fptoui_4f32_to_2i64: -; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: subss %xmm2, %xmm1 -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: 
xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm0, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm0 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: fptoui_4f32_to_2i64: -; VEX: # %bb.0: -; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; VEX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; VEX-NEXT: vsubss %xmm2, %xmm1, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm1, %rdx -; VEX-NEXT: vucomiss %xmm2, %xmm1 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vsubss %xmm2, %xmm0, %xmm1 -; VEX-NEXT: vcvttss2si %xmm1, %rax -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm2, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vmovq %rdx, %xmm1 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_4f32_to_2i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2usi %xmm1, %rax -; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx -; AVX512F-NEXT: vmovq %rcx, %xmm0 -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_4f32_to_2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx -; AVX512VL-NEXT: vmovq %rcx, %xmm0 -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_4f32_to_2i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0 -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <4 x float> %a to <4 x i64> - %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> - ret <2 x i64> %shuf -} - -define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) { -; SSE-LABEL: fptoui_8f32_to_8i32: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: cmpltps %xmm4, %xmm2 -; SSE-NEXT: cvttps2dq %xmm0, %xmm3 -; SSE-NEXT: subps %xmm4, %xmm0 -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; SSE-NEXT: xorps %xmm5, %xmm0 -; SSE-NEXT: andps %xmm2, %xmm3 -; SSE-NEXT: andnps %xmm0, %xmm2 -; SSE-NEXT: orps %xmm3, %xmm2 -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: cmpltps %xmm4, %xmm3 -; SSE-NEXT: cvttps2dq %xmm1, %xmm0 -; SSE-NEXT: subps %xmm4, %xmm1 -; SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE-NEXT: xorps %xmm5, %xmm1 -; SSE-NEXT: andps %xmm3, %xmm0 -; 
SSE-NEXT: andnps %xmm1, %xmm3 -; SSE-NEXT: orps %xmm0, %xmm3 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: movaps %xmm3, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_8f32_to_8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 -; AVX1-NEXT: vsubps %ymm1, %ymm0, %ymm1 -; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_8f32_to_8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] -; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648] -; AVX2-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_8f32_to_8i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_8f32_to_8i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttps2udq %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_8f32_to_8i32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_8f32_to_8i32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2udq %ymm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <8 x float> %a to <8 x i32> - ret <8 x i32> %cvt -} - -define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) { -; SSE-LABEL: fptoui_4f32_to_4i64: -; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: subss %xmm1, %xmm2 -; SSE-NEXT: cvttss2si %xmm2, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm0, %rax 
-; SSE-NEXT: ucomiss %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_4f32_to_4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm2, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm2 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vcvttss2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm3, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm3 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm0, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vcvttss2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm0, %rcx -; AVX1-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_4f32_to_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm2, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm2 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vcvttss2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm3, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm3 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm0, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX2-NEXT: vcvttss2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm0, %rcx -; AVX2-NEXT: vucomiss %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_4f32_to_4i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvttss2usi %xmm1, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512F-NEXT: vcvttss2usi %xmm2, %rax -; AVX512F-NEXT: 
vmovq %rax, %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512F-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_4f32_to_4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0] -; AVX512VL-NEXT: vcvttss2usi %xmm2, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_4f32_to_4i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> - %cvt = fptoui <4 x float> %shuf to <4 x i64> - ret <4 x i64> %cvt -} - -define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) { -; SSE-LABEL: fptoui_8f32_to_4i64: -; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: subss %xmm1, %xmm2 -; SSE-NEXT: cvttss2si %xmm2, %rcx -; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm0, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm2 -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: movaps %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3] -; SSE-NEXT: movaps %xmm3, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm3, %rdx -; SSE-NEXT: ucomiss %xmm1, %xmm3 -; SSE-NEXT: cmovaeq %rcx, %rdx -; SSE-NEXT: movq %rdx, %xmm3 -; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: subss %xmm1, %xmm4 -; SSE-NEXT: cvttss2si %xmm4, %rcx -; SSE-NEXT: xorq %rax, %rcx -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: ucomiss %xmm1, %xmm0 -; SSE-NEXT: cmovaeq %rcx, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_8f32_to_4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vcvttss2si 
%xmm3, %rax -; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm2, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm2 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm2 -; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vcvttss2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm3, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm3 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vcvttss2si %xmm3, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm0, %rdx -; AVX1-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm3 -; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX1-NEXT: vcvttss2si %xmm4, %rax -; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vcvttss2si %xmm0, %rcx -; AVX1-NEXT: vucomiss %xmm1, %xmm0 -; AVX1-NEXT: cmovaeq %rax, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_8f32_to_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3] -; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3 -; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm2, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm2 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm2 -; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0] -; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4 -; AVX2-NEXT: vcvttss2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm3, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm3 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; AVX2-NEXT: vcvttss2si %xmm3, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm0, %rdx -; AVX2-NEXT: vucomiss %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rdx -; AVX2-NEXT: vmovq %rdx, %xmm3 -; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4 -; AVX2-NEXT: vcvttss2si %xmm4, %rax -; AVX2-NEXT: xorq %rcx, %rax -; AVX2-NEXT: vcvttss2si %xmm0, %rcx -; AVX2-NEXT: vucomiss %xmm1, %xmm0 -; AVX2-NEXT: cmovaeq %rax, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: fptoui_8f32_to_4i64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2usi %xmm1, %rax -; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx -; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi -; AVX512F-NEXT: vmovq %rsi, %xmm0 -; AVX512F-NEXT: vmovq %rdx, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vmovq %rcx, %xmm1 -; AVX512F-NEXT: vmovq %rax, %xmm2 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512F-NEXT: retq -; -; 
AVX512VL-LABEL: fptoui_8f32_to_4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx -; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0] -; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi -; AVX512VL-NEXT: vmovq %rsi, %xmm0 -; AVX512VL-NEXT: vmovq %rdx, %xmm1 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vmovq %rcx, %xmm1 -; AVX512VL-NEXT: vmovq %rax, %xmm2 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_8f32_to_4i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_8f32_to_4i64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2uqq %ymm0, %zmm0 -; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <8 x float> %a to <8 x i64> - %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> - ret <4 x i64> %shuf -} - -; -; Constant Folding -; - -define <2 x i64> @fptosi_2f64_to_2i64_const() { -; SSE-LABEL: fptosi_2f64_to_2i64_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f64_to_2i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615] -; AVX-NEXT: retq - %cvt = fptosi <2 x double> to <2 x i64> - ret <2 x i64> %cvt -} - -define <4 x i32> @fptosi_2f64_to_2i32_const() { -; SSE-LABEL: fptosi_2f64_to_2i32_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u> -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f64_to_2i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u> -; AVX-NEXT: retq - %cvt = fptosi <2 x double> to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> - ret <4 x i32> %ext -} - -define <4 x i64> @fptosi_4f64_to_4i64_const() { -; SSE-LABEL: fptosi_4f64_to_4i64_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_4f64_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613] -; AVX-NEXT: retq - %cvt = fptosi <4 x double> to <4 x i64> - ret <4 x i64> %cvt -} - -define <4 x i32> @fptosi_4f64_to_4i32_const() { -; SSE-LABEL: fptosi_4f64_to_4i32_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_4f64_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3] -; AVX-NEXT: retq - %cvt = fptosi <4 x double> to <4 x i32> - ret <4 x i32> %cvt -} - -define <2 x i64> @fptoui_2f64_to_2i64_const() { -; SSE-LABEL: fptoui_2f64_to_2i64_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_2f64_to_2i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4] -; AVX-NEXT: retq - %cvt = fptoui <2 x double> to <2 x i64> - ret <2 x i64> %cvt -} - -define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { -; SSE-LABEL: fptoui_2f64_to_2i32_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u> 
-; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_2f64_to_2i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u> -; AVX-NEXT: retq - %cvt = fptoui <2 x double> to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> - ret <4 x i32> %ext -} - -define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) { -; SSE-LABEL: fptoui_4f64_to_4i64_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8] -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_4f64_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8] -; AVX-NEXT: retq - %cvt = fptoui <4 x double> to <4 x i64> - ret <4 x i64> %cvt -} - -define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) { -; SSE-LABEL: fptoui_4f64_to_4i32_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8] -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_4f64_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8] -; AVX-NEXT: retq - %cvt = fptoui <4 x double> to <4 x i32> - ret <4 x i32> %cvt -} - -define <4 x i32> @fptosi_4f32_to_4i32_const() { -; SSE-LABEL: fptosi_4f32_to_4i32_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_4f32_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3] -; AVX-NEXT: retq - %cvt = fptosi <4 x float> to <4 x i32> - ret <4 x i32> %cvt -} - -define <4 x i64> @fptosi_4f32_to_4i64_const() { -; SSE-LABEL: fptosi_4f32_to_4i64_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_4f32_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3] -; AVX-NEXT: retq - %cvt = fptosi <4 x float> to <4 x i64> - ret <4 x i64> %cvt -} - -define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) { -; SSE-LABEL: fptosi_8f32_to_8i32_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_8f32_to_8i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295] -; AVX-NEXT: retq - %cvt = fptosi <8 x float> to <8 x i32> - ret <8 x i32> %cvt -} - -define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) { -; SSE-LABEL: fptoui_4f32_to_4i32_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_4f32_to_4i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6] -; AVX-NEXT: retq - %cvt = fptoui <4 x float> to <4 x i32> - ret <4 x i32> %cvt -} - -define <4 x i64> @fptoui_4f32_to_4i64_const() { -; SSE-LABEL: fptoui_4f32_to_4i64_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [4,8] -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_4f32_to_4i64_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8] -; AVX-NEXT: retq - %cvt = fptoui <4 x float> to <4 x i64> - ret <4 x i64> %cvt -} - -define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) { -; SSE-LABEL: fptoui_8f32_to_8i32_const: -; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6] -; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1] -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_8f32_to_8i32_const: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1] -; AVX-NEXT: retq - %cvt = fptoui <8 x 
float> to <8 x i32> - ret <8 x i32> %cvt -} - -; -; Special Cases -; - -define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind { -; SSE-LABEL: fptosi_2f16_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rax -; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: callq __gnu_f2h_ieee -; SSE-NEXT: movzwl %ax, %edi -; SSE-NEXT: callq __gnu_h2f_ieee -; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill -; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: callq __gnu_f2h_ieee -; SSE-NEXT: movzwl %ax, %edi -; SSE-NEXT: callq __gnu_h2f_ieee -; SSE-NEXT: cvttss2si %xmm0, %eax -; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload -; SSE-NEXT: movd %ecx, %xmm0 -; SSE-NEXT: movd %eax, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE-NEXT: popq %rax -; SSE-NEXT: retq -; -; VEX-LABEL: fptosi_2f16_to_4i32: -; VEX: # %bb.0: -; VEX-NEXT: pushq %rax -; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; VEX-NEXT: vmovaps %xmm1, %xmm0 -; VEX-NEXT: callq __gnu_f2h_ieee -; VEX-NEXT: movzwl %ax, %edi -; VEX-NEXT: callq __gnu_h2f_ieee -; VEX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill -; VEX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload -; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero -; VEX-NEXT: callq __gnu_f2h_ieee -; VEX-NEXT: movzwl %ax, %edi -; VEX-NEXT: callq __gnu_h2f_ieee -; VEX-NEXT: vcvttss2si %xmm0, %eax -; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload -; VEX-NEXT: vmovd %ecx, %xmm0 -; VEX-NEXT: vmovd %eax, %xmm1 -; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; VEX-NEXT: popq %rax -; VEX-NEXT: retq -; -; AVX512-LABEL: fptosi_2f16_to_4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 -; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 -; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 -; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 -; AVX512-NEXT: vcvttss2si %xmm0, %eax -; AVX512-NEXT: vcvttss2si %xmm1, %ecx -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vmovd %eax, %xmm1 -; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512-NEXT: retq - %cvt = fptosi <2 x half> %a to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> - ret <4 x i32> %ext -} - -define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind { -; SSE-LABEL: fptosi_2f80_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: fldt {{[0-9]+}}(%rsp) -; SSE-NEXT: fldt {{[0-9]+}}(%rsp) -; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: orl $3072, %eax # imm = 0xC00 -; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) -; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax -; SSE-NEXT: orl $3072, %eax # imm = 0xC00 -; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp) -; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp) -; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp) -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f80_to_4i32: -; AVX: # %bb.0: -; 
AVX-NEXT: fldt {{[0-9]+}}(%rsp) -; AVX-NEXT: fldt {{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) -; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX-NEXT: retq - %cvt = fptosi <2 x x86_fp80> %a to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> - ret <4 x i32> %ext -} - -define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { -; SSE-LABEL: fptosi_2f128_to_4i32: -; SSE: # %bb.0: -; SSE-NEXT: pushq %rbp -; SSE-NEXT: pushq %r14 -; SSE-NEXT: pushq %rbx -; SSE-NEXT: movq %rcx, %r14 -; SSE-NEXT: movq %rdx, %rbx -; SSE-NEXT: callq __fixtfsi -; SSE-NEXT: movl %eax, %ebp -; SSE-NEXT: movq %rbx, %rdi -; SSE-NEXT: movq %r14, %rsi -; SSE-NEXT: callq __fixtfsi -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: movd %ebp, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE-NEXT: popq %rbx -; SSE-NEXT: popq %r14 -; SSE-NEXT: popq %rbp -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f128_to_4i32: -; AVX: # %bb.0: -; AVX-NEXT: pushq %rbp -; AVX-NEXT: pushq %r14 -; AVX-NEXT: pushq %rbx -; AVX-NEXT: movq %rcx, %r14 -; AVX-NEXT: movq %rdx, %rbx -; AVX-NEXT: callq __fixtfsi -; AVX-NEXT: movl %eax, %ebp -; AVX-NEXT: movq %rbx, %rdi -; AVX-NEXT: movq %r14, %rsi -; AVX-NEXT: callq __fixtfsi -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vmovd %ebp, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX-NEXT: popq %rbx -; AVX-NEXT: popq %r14 -; AVX-NEXT: popq %rbp -; AVX-NEXT: retq - %cvt = fptosi <2 x fp128> %a to <2 x i32> - %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> - ret <4 x i32> %ext -} - -define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) { -; SSE-LABEL: fptosi_2f32_to_2i8: -; SSE: # %bb.0: -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f32_to_2i8: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq - %cvt = fptosi <2 x float> %a to <2 x i8> - ret <2 x i8> %cvt -} - -define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) { -; SSE-LABEL: fptosi_2f32_to_2i16: -; SSE: # %bb.0: -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f32_to_2i16: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: retq - %cvt = fptosi <2 x float> %a to <2 x i16> - ret <2 x i16> %cvt -} - -define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) { -; SSE-LABEL: fptoui_2f32_to_2i8: -; SSE: # %bb.0: -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_2f32_to_2i8: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq - %cvt = fptoui <2 x float> %a to <2 x i8> - ret <2 x i8> %cvt -} - -define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) { -; SSE-LABEL: fptoui_2f32_to_2i16: -; SSE: 
# %bb.0: -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_2f32_to_2i16: -; AVX: # %bb.0: -; AVX-NEXT: vcvttps2dq %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: retq - %cvt = fptoui <2 x float> %a to <2 x i16> - ret <2 x i16> %cvt -} - -define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) { -; SSE-LABEL: fptosi_2f64_to_2i8: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f64_to_2i8: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq - %cvt = fptosi <2 x double> %a to <2 x i8> - ret <2 x i8> %cvt -} - -define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) { -; SSE-LABEL: fptosi_2f64_to_2i16: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: retq -; -; AVX-LABEL: fptosi_2f64_to_2i16: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: retq - %cvt = fptosi <2 x double> %a to <2 x i16> - ret <2 x i16> %cvt -} - -define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) { -; SSE-LABEL: fptoui_2f64_to_2i8: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_2f64_to_2i8: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX-NEXT: retq - %cvt = fptoui <2 x double> %a to <2 x i8> - ret <2 x i8> %cvt -} - -define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) { -; SSE-LABEL: fptoui_2f64_to_2i16: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: retq -; -; AVX-LABEL: fptoui_2f64_to_2i16: -; AVX: # %bb.0: -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0 -; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX-NEXT: retq - %cvt = fptoui <2 x double> %a to <2 x i16> - ret <2 x i16> %cvt -} - -define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) { -; SSE-LABEL: fptosi_8f64_to_8i16: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm3, %xmm3 -; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: packssdw %xmm2, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: fptosi_8f64_to_8i16: -; VEX: # %bb.0: -; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1 -; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vzeroupper -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_8f64_to_8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_8f64_to_8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_8f64_to_8i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw 
%zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_8f64_to_8i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %cvt = fptosi <8 x double> %a to <8 x i16> - ret <8 x i16> %cvt -} - -define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) { -; SSE-LABEL: fptoui_8f64_to_8i16: -; SSE: # %bb.0: -; SSE-NEXT: cvttpd2dq %xmm3, %xmm3 -; SSE-NEXT: cvttpd2dq %xmm2, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: cvttpd2dq %xmm1, %xmm1 -; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: retq -; -; VEX-LABEL: fptoui_8f64_to_8i16: -; VEX: # %bb.0: -; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1 -; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0 -; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; VEX-NEXT: vzeroupper -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_8f64_to_8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_8f64_to_8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_8f64_to_8i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_8f64_to_8i16: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0 -; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %cvt = fptoui <8 x double> %a to <8 x i16> - ret <8 x i16> %cvt -} - -define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) { -; SSE-LABEL: fptosi_16f32_to_16i8: -; SSE: # %bb.0: -; SSE-NEXT: cvttps2dq %xmm3, %xmm3 -; SSE-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptosi_16f32_to_16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptosi_16f32_to_16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; 
AVX2-NEXT: retq -; -; AVX512-LABEL: fptosi_16f32_to_16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = fptosi <16 x float> %a to <16 x i8> - ret <16 x i8> %cvt -} - -define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) { -; SSE-LABEL: fptoui_16f32_to_16i8: -; SSE: # %bb.0: -; SSE-NEXT: cvttps2dq %xmm3, %xmm3 -; SSE-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE-NEXT: cvttps2dq %xmm0, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: fptoui_16f32_to_16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: fptoui_16f32_to_16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: fptoui_16f32_to_16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = fptoui <16 x float> %a to <16 x i8> - ret <16 x i8> %cvt -} - -define <2 x i64> @fptosi_2f32_to_2i64_load(<2 x float>* %x) { -; SSE-LABEL: fptosi_2f32_to_2i64_load: -; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE-NEXT: cvttss2si %xmm1, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq -; -; VEX-LABEL: fptosi_2f32_to_2i64_load: -; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm1 -; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vcvttss2si %xmm0, %rax -; VEX-NEXT: vmovq %rax, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptosi_2f32_to_2i64_load: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2si %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptosi_2f32_to_2i64_load: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2si %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptosi_2f32_to_2i64_load: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 
-; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64_load: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2qq (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %a = load <2 x float>, <2 x float>* %x - %b = fptosi <2 x float> %a to <2 x i64> - ret <2 x i64> %b -} - -define <2 x i64> @fptoui_2f32_to_2i64_load(<2 x float>* %x) { -; SSE-LABEL: fptoui_2f32_to_2i64_load: -; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: subss %xmm2, %xmm0 -; SSE-NEXT: cvttss2si %xmm0, %rax -; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm1, %rdx -; SSE-NEXT: ucomiss %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rax, %rdx -; SSE-NEXT: movq %rdx, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE-NEXT: movaps %xmm1, %xmm3 -; SSE-NEXT: subss %xmm2, %xmm3 -; SSE-NEXT: cvttss2si %xmm3, %rax -; SSE-NEXT: xorq %rcx, %rax -; SSE-NEXT: cvttss2si %xmm1, %rcx -; SSE-NEXT: ucomiss %xmm2, %xmm1 -; SSE-NEXT: cmovaeq %rax, %rcx -; SSE-NEXT: movq %rcx, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq -; -; VEX-LABEL: fptoui_2f32_to_2i64_load: -; VEX: # %bb.0: -; VEX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2 -; VEX-NEXT: vcvttss2si %xmm2, %rax -; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rdx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rdx -; VEX-NEXT: vmovq %rdx, %xmm2 -; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3 -; VEX-NEXT: vcvttss2si %xmm3, %rax -; VEX-NEXT: xorq %rcx, %rax -; VEX-NEXT: vcvttss2si %xmm0, %rcx -; VEX-NEXT: vucomiss %xmm1, %xmm0 -; VEX-NEXT: cmovaeq %rax, %rcx -; VEX-NEXT: vmovq %rcx, %xmm0 -; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: fptoui_2f32_to_2i64_load: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm1 -; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512F-NEXT: vcvttss2usi %xmm0, %rax -; AVX512F-NEXT: vmovq %rax, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: fptoui_2f32_to_2i64_load: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm1 -; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax -; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: fptoui_2f32_to_2i64_load: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64_load: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvttps2uqq (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %a = load <2 x float>, <2 x float>* %x - %b = fptoui <2 x float> %a to <2 x i64> - ret <2 x i64> %b -} diff --git a/test/CodeGen/X86/vec_int_to_fp-widen.ll b/test/CodeGen/X86/vec_int_to_fp-widen.ll 
deleted file mode 100644
index 6891a3d0245..00000000000
--- a/test/CodeGen/X86/vec_int_to_fp-widen.ll
+++ /dev/null
@@ -1,6008 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ
-;
-; 32-bit tests to make sure we're not doing anything stupid.
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1
-
-;
-; Signed Integer to Double
-;
-
-define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_2i64_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_2i64_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2sd %rax, %xmm1
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2sd %rax, %xmm0
-; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_2i64_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_2i64_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vmovlhps
{{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_2i64_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_2i64_to_2f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = sitofp <2 x i64> %a to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) { -; SSE-LABEL: sitofp_2i32_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_2i32_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %cvt = sitofp <2 x i32> %shuf to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) { -; SSE-LABEL: sitofp_4i32_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_4i32_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = sitofp <4 x i32> %a to <4 x double> - %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> - ret <2 x double> %shuf -} - -define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) { -; SSE2-LABEL: sitofp_2i16_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_2i16_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_2i16_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> - %cvt = sitofp <2 x i16> %shuf to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) { -; SSE2-LABEL: sitofp_8i16_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_8i16_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_8i16_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovsxwd %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: sitofp_8i16_to_2f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = sitofp <8 x i16> %a to <8 x double> - %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> - ret <2 x double> %shuf -} - -define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) { -; SSE2-LABEL: sitofp_2i8_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: 
punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_2i8_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_2i8_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> - %cvt = sitofp <2 x i8> %shuf to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) { -; SSE2-LABEL: sitofp_16i8_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_16i8_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_16i8_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovsxbd %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: sitofp_16i8_to_2f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = sitofp <16 x i8> %a to <16 x double> - %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> - ret <2 x double> %shuf -} - -define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { -; SSE2-LABEL: sitofp_4i64_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: cvtsi2sd %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: cvtsi2sd %rax, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0] -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_4i64_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cvtsi2sd %rax, %xmm2 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2sd %rax, %xmm0 -; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2sd %rax, %xmm2 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2sd %rax, %xmm1 -; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_4i64_to_4f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_4i64_to_4f64: -; 
AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sitofp_4i64_to_4f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_4i64_to_4f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_4i64_to_4f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %cvt = sitofp <4 x i64> %a to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) { -; SSE-LABEL: sitofp_4i32_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: cvtdq2pd %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_4i32_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %cvt = sitofp <4 x i32> %a to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) { -; SSE2-LABEL: sitofp_4i16_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_4i16_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_4i16_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %shuf = 
shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %cvt = sitofp <4 x i16> %shuf to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) { -; SSE2-LABEL: sitofp_8i16_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_8i16_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_8i16_to_4f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovsxwd %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: sitofp_8i16_to_4f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq - %cvt = sitofp <8 x i16> %a to <8 x double> - %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> - ret <4 x double> %shuf -} - -define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) { -; SSE2-LABEL: sitofp_4i8_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_4i8_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_4i8_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> - %cvt = sitofp <4 x i8> %shuf to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) { -; SSE2-LABEL: sitofp_16i8_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_16i8_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_16i8_to_4f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovsxbd %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: sitofp_16i8_to_4f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq - %cvt = sitofp <16 x i8> %a to <16 x double> - %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> - ret <4 x double> %shuf -} - -; -; Unsigned Integer to Double -; - -define <2 x double> 
@uitofp_2i64_to_2f64(<2 x i64> %a) { -; SSE2-LABEL: uitofp_2i64_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: por {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_2i64_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: por {{.*}}(%rip), %xmm1 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_2i64_to_2f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_2i64_to_2f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_2i64_to_2f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_2i64_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_2i64_to_2f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <2 x i64> %a to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) { -; SSE2-LABEL: uitofp_2i32_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_2i32_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: 
cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_2i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] -; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 -; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_2i32_to_2f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_2i32_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_2i32_to_2f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> - %cvt = uitofp <2 x i32> %shuf to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) { -; SSE2-LABEL: uitofp_4i32_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i32_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_4i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_4i32_to_2f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_4i32_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_4i32_to_2f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <4 x i32> %a to <4 x double> - %shuf = shufflevector 
<4 x double> %cvt, <4 x double> undef, <2 x i32> - ret <2 x double> %shuf -} - -define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) { -; SSE2-LABEL: uitofp_2i16_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_2i16_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_2i16_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> - %cvt = uitofp <2 x i16> %shuf to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) { -; SSE2-LABEL: uitofp_8i16_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_8i16_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_8i16_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: uitofp_8i16_to_2f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = uitofp <8 x i16> %a to <8 x double> - %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> - ret <2 x double> %shuf -} - -define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) { -; SSE2-LABEL: uitofp_2i8_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_2i8_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_2i8_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> - %cvt = uitofp <2 x i8> %shuf to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) { -; SSE2-LABEL: uitofp_16i8_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_16i8_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_16i8_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: uitofp_16i8_to_2f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = uitofp <16 x i8> %a to <16 x double> - %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> - ret <2 x double> %shuf -} - -define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { -; SSE2-LABEL: uitofp_4i64_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] -; SSE2-NEXT: subpd %xmm6, %xmm0 -; SSE2-NEXT: addpd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: subpd %xmm6, %xmm1 -; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i64_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] -; SSE41-NEXT: por %xmm4, %xmm3 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] -; SSE41-NEXT: subpd %xmm6, %xmm0 -; SSE41-NEXT: addpd %xmm3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: subpd %xmm6, %xmm1 -; SSE41-NEXT: addpd %xmm2, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_4i64_to_4f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, 
%xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_4i64_to_4f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_4i64_to_4f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_4i64_to_4f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 -; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_4i64_to_4f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2pd %ymm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <4 x i64> %a to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) { -; SSE2-LABEL: uitofp_4i32_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] -; SSE2-NEXT: mulpd %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5 -; SSE2-NEXT: mulpd %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1 -; SSE2-NEXT: addpd %xmm5, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i32_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, 
%xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] -; SSE41-NEXT: mulpd %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5 -; SSE41-NEXT: mulpd %xmm2, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1 -; SSE41-NEXT: addpd %xmm5, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_4i32_to_4f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_4i32_to_4f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] -; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_4i32_to_4f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_4i32_to_4f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_4i32_to_4f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <4 x i32> %a to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) { -; SSE2-LABEL: uitofp_4i16_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i16_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_4i16_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %cvt = uitofp <4 x i16> %shuf to <4 x double> - ret <4 x double> %cvt -} - -define 
<4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) { -; SSE2-LABEL: uitofp_8i16_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_8i16_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_8i16_to_4f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: uitofp_8i16_to_4f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq - %cvt = uitofp <8 x i16> %a to <8 x double> - %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> - ret <4 x double> %shuf -} - -define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) { -; SSE2-LABEL: uitofp_4i8_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i8_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_4i8_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> - %cvt = uitofp <4 x i8> %shuf to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) { -; SSE2-LABEL: uitofp_16i8_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_16i8_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_16i8_to_4f64: -; VEX: # %bb.0: -; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; VEX-NEXT: retq -; -; AVX512-LABEL: uitofp_16i8_to_4f64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq - %cvt = uitofp <16 x i8> %a to <16 x double> - %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> - ret <4 x double> %shuf -} - -; -; Signed Integer to Float -; - -define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { -; SSE2-LABEL: sitofp_2i64_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_2i64_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_2i64_to_4f32: -; VEX: # %bb.0: -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: sitofp_2i64_to_4f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_2i64_to_4f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_2i64_to_4f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = sitofp <2 x i64> %a to <2 x float> - %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> - ret <4 x float> %ext -} - -define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) { -; SSE2-LABEL: sitofp_2i64_to_4f32_zero: -; SSE2: # %bb.0: -; 
SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_2i64_to_4f32_zero: -; SSE41: # %bb.0: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_2i64_to_4f32_zero: -; VEX: # %bb.0: -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: sitofp_2i64_to_4f32_zero: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = sitofp <2 x i64> %a to <2 x float> - %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> - ret <4 x float> %ext -} - -define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { -; SSE2-LABEL: sitofp_4i64_to_4f32_undef: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_4i64_to_4f32_undef: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_4i64_to_4f32_undef: -; VEX: # %bb.0: -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: sitofp_4i64_to_4f32_undef: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; 
AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> - %cvt = sitofp <4 x i64> %ext to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) { -; SSE-LABEL: sitofp_4i32_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_4i32_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = sitofp <4 x i32> %a to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) { -; SSE2-LABEL: sitofp_4i16_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_4i16_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_4i16_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %cvt = sitofp <4 x i16> %shuf to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) { -; SSE2-LABEL: sitofp_8i16_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_8i16_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_8i16_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_8i16_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: sitofp_8i16_to_4f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; 
AVX512-NEXT: retq - %cvt = sitofp <8 x i16> %a to <8 x float> - %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> - ret <4 x float> %shuf -} - -define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) { -; SSE2-LABEL: sitofp_4i8_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_4i8_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_4i8_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> - %cvt = sitofp <4 x i8> %shuf to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) { -; SSE2-LABEL: sitofp_16i8_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_16i8_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_16i8_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_16i8_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: sitofp_16i8_to_4f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = sitofp <16 x i8> %a to <16 x float> - %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> - ret <4 x float> %shuf -} - -define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { -; SSE2-LABEL: sitofp_4i64_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_4i64_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; 
SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_4i64_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_4i64_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sitofp_4i64_to_4f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_4i64_to_4f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_4i64_to_4f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %cvt = sitofp <4 x i64> %a to <4 x float> - 
ret <4 x float> %cvt -} - -define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) { -; SSE-LABEL: sitofp_8i32_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_8i32_to_8f32: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX-NEXT: retq - %cvt = sitofp <8 x i32> %a to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) { -; SSE2-LABEL: sitofp_8i16_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_8i16_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_8i16_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_8i16_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sitofp_8i16_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: retq - %cvt = sitofp <8 x i16> %a to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) { -; SSE2-LABEL: sitofp_8i8_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_8i8_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_8i8_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_8i8_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sitofp_8i8_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: retq - %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %cvt = sitofp <8 x i8> %shuf to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) { -; SSE2-LABEL: sitofp_16i8_to_8f32: -; SSE2: # %bb.0: -; 
SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_16i8_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_16i8_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_16i8_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sitofp_16i8_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq - %cvt = sitofp <16 x i8> %a to <16 x float> - %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> - ret <8 x float> %shuf -} - -; -; Unsigned Integer to Float -; - -define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { -; SSE2-LABEL: uitofp_2i64_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB39_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB39_3 -; SSE2-NEXT: .LBB39_1: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB39_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB39_4 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; SSE2-NEXT: .LBB39_4: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_2i64_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB39_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: jmp .LBB39_3 -; SSE41-NEXT: .LBB39_1: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm1 -; SSE41-NEXT: .LBB39_3: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB39_4 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB39_4: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_2i64_to_4f32: -; VEX: # %bb.0: -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB39_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB39_3 -; VEX-NEXT: .LBB39_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB39_3: -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB39_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq -; VEX-NEXT: .LBB39_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_2i64_to_4f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_2i64_to_4f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_2i64_to_4f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <2 x i64> %a to <2 x float> - %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> - ret <4 x float> %ext -} - -define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { -; SSE2-LABEL: uitofp_2i64_to_2f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB40_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: jmp .LBB40_3 -; SSE2-NEXT: .LBB40_1: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 -; SSE2-NEXT: .LBB40_3: -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB40_4 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB40_6 -; SSE2-NEXT: .LBB40_4: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: 
shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB40_6: -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_2i64_to_2f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB40_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: jmp .LBB40_3 -; SSE41-NEXT: .LBB40_1: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: .LBB40_3: -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB40_4 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB40_4: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_2i64_to_2f32: -; VEX: # %bb.0: -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB40_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB40_3 -; VEX-NEXT: .LBB40_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB40_3: -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB40_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; VEX-NEXT: retq -; VEX-NEXT: .LBB40_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_2i64_to_2f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_2i64_to_2f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_2i64_to_2f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32: -; 
AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <2 x i64> %a to <2 x float> - %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> - ret <4 x float> %ext -} - -define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { -; SSE2-LABEL: uitofp_4i64_to_4f32_undef: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB41_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: jmp .LBB41_3 -; SSE2-NEXT: .LBB41_1: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 -; SSE2-NEXT: .LBB41_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB41_4 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB41_6 -; SSE2-NEXT: .LBB41_4: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB41_6: -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i64_to_4f32_undef: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB41_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: jmp .LBB41_3 -; SSE41-NEXT: .LBB41_1: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm1 -; SSE41-NEXT: .LBB41_3: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB41_4 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB41_4: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_4i64_to_4f32_undef: -; VEX: # %bb.0: -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB41_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB41_3 -; VEX-NEXT: .LBB41_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB41_3: -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB41_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq -; VEX-NEXT: .LBB41_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; VEX-NEXT: retq -; -; AVX512F-LABEL: 
uitofp_4i64_to_4f32_undef: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> - %cvt = uitofp <4 x i64> %ext to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) { -; SSE2-LABEL: uitofp_4i32_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: por {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addps {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addps %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i32_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; SSE41-NEXT: addps {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_4i32_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_4i32_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_4i32_to_4f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed 
$zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_4i32_to_4f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_4i32_to_4f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <4 x i32> %a to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) { -; SSE2-LABEL: uitofp_4i16_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i16_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_4i16_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> - %cvt = uitofp <4 x i16> %shuf to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) { -; SSE2-LABEL: uitofp_8i16_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_8i16_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_8i16_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_8i16_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: uitofp_8i16_to_4f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = uitofp <8 x i16> %a to <8 x float> - %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> - ret <4 x float> %shuf -} - -define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) { -; SSE2-LABEL: uitofp_4i8_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i8_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_4i8_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> - %cvt = uitofp <4 x i8> %shuf to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { -; SSE2-LABEL: uitofp_16i8_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_16i8_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_16i8_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_16i8_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: uitofp_16i8_to_4f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %cvt = uitofp <16 x i8> %a to <16 x float> - %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> - ret <4 x float> %shuf -} - -define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { -; SSE2-LABEL: uitofp_4i64_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, 
%rax -; SSE2-NEXT: js .LBB47_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: jmp .LBB47_3 -; SSE2-NEXT: .LBB47_1: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: addss %xmm2, %xmm2 -; SSE2-NEXT: .LBB47_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB47_4 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: jmp .LBB47_6 -; SSE2-NEXT: .LBB47_4: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: addss %xmm3, %xmm3 -; SSE2-NEXT: .LBB47_6: -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB47_7 -; SSE2-NEXT: # %bb.8: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: jmp .LBB47_9 -; SSE2-NEXT: .LBB47_7: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 -; SSE2-NEXT: .LBB47_9: -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB47_10 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB47_12 -; SSE2-NEXT: .LBB47_10: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB47_12: -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_4i64_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB47_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: jmp .LBB47_3 -; SSE41-NEXT: .LBB47_1: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: addss %xmm2, %xmm2 -; SSE41-NEXT: .LBB47_3: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB47_4 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: jmp .LBB47_6 -; SSE41-NEXT: .LBB47_4: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: .LBB47_6: -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB47_7 -; SSE41-NEXT: # %bb.8: -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: jmp .LBB47_9 -; SSE41-NEXT: .LBB47_7: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: addss %xmm2, %xmm2 -; SSE41-NEXT: .LBB47_9: -; SSE41-NEXT: 
insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB47_10 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB47_10: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_4i64_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB47_1 -; AVX1-NEXT: # %bb.2: -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-NEXT: jmp .LBB47_3 -; AVX1-NEXT: .LBB47_1: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: .LBB47_3: -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB47_4 -; AVX1-NEXT: # %bb.5: -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: jmp .LBB47_6 -; AVX1-NEXT: .LBB47_4: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB47_6: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB47_7 -; AVX1-NEXT: # %bb.8: -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: jmp .LBB47_9 -; AVX1-NEXT: .LBB47_7: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: .LBB47_9: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: testq %rax, %rax -; AVX1-NEXT: js .LBB47_10 -; AVX1-NEXT: # %bb.11: -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; AVX1-NEXT: .LBB47_10: -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shrq %rcx -; AVX1-NEXT: andl $1, %eax -; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_4i64_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB47_1 -; AVX2-NEXT: # %bb.2: -; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX2-NEXT: jmp .LBB47_3 -; AVX2-NEXT: .LBB47_1: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: .LBB47_3: -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB47_4 -; AVX2-NEXT: # %bb.5: -; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-NEXT: jmp .LBB47_6 -; AVX2-NEXT: .LBB47_4: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, 
%rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB47_6: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB47_7 -; AVX2-NEXT: # %bb.8: -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: jmp .LBB47_9 -; AVX2-NEXT: .LBB47_7: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: .LBB47_9: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: testq %rax, %rax -; AVX2-NEXT: js .LBB47_10 -; AVX2-NEXT: # %bb.11: -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; AVX2-NEXT: .LBB47_10: -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shrq %rcx -; AVX2-NEXT: andl $1, %eax -; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_4i64_to_4f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_4i64_to_4f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_4i64_to_4f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0 -; AVX512VLDQ-NEXT: vzeroupper -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <4 x i64> %a to <4 x float> - ret <4 x float> %cvt -} - -define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { -; SSE2-LABEL: uitofp_8i32_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = 
[1258291200,1258291200,1258291200,1258291200] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; SSE2-NEXT: addps %xmm6, %xmm0 -; SSE2-NEXT: addps %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: addps %xmm6, %xmm1 -; SSE2-NEXT: addps %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_8i32_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] -; SSE41-NEXT: movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; SSE41-NEXT: addps %xmm5, %xmm0 -; SSE41-NEXT: addps %xmm3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; SSE41-NEXT: addps %xmm5, %xmm1 -; SSE41-NEXT: addps %xmm2, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_8i32_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_8i32_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_8i32_to_8f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_8i32_to_8f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_8i32_to_8f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; 
AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2ps %ymm0, %ymm0 -; AVX512VLDQ-NEXT: retq - %cvt = uitofp <8 x i32> %a to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { -; SSE2-LABEL: uitofp_8i16_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_8i16_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_8i16_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_8i16_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: uitofp_8i16_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: retq - %cvt = uitofp <8 x i16> %a to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { -; SSE2-LABEL: uitofp_8i8_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_8i8_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_8i8_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_8i8_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: uitofp_8i8_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: retq - %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> - %cvt = uitofp <8 x i8> %shuf to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { -; SSE2-LABEL: uitofp_16i8_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_16i8_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_16i8_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_16i8_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: uitofp_16i8_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0 -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq - %cvt = uitofp <16 x i8> %a to <16 x float> - %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> - ret <8 x float> %shuf -} - -; -; Load Signed Integer to Double -; - -define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { -; SSE2-LABEL: sitofp_load_2i64_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_2i64_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cvtsi2sd %rax, %xmm1 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2sd %rax, %xmm0 -; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_load_2i64_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: sitofp_load_2i64_to_2f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_load_2i64_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 -; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %ld = load <2 x i64>, <2 x i64> *%a - %cvt = sitofp <2 x i64> %ld to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) { -; SSE-LABEL: sitofp_load_2i32_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_load_2i32_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 -; AVX-NEXT: retq - %ld = load <2 x i32>, <2 x i32> *%a - %cvt = sitofp <2 x i32> %ld to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) { -; SSE-LABEL: 
sitofp_volatile_load_4i32_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load volatile <4 x i32>, <4 x i32> *%a - %b = shufflevector <4 x i32> %ld, <4 x i32> undef, <2 x i32> - %cvt = sitofp <2 x i32> %b to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) { -; SSE-LABEL: sitofp_load_4i32_to_2f64_2: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2pd (%rdi), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_load_4i32_to_2f64_2: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0 -; AVX-NEXT: retq - %a = load <4 x i32>, <4 x i32>* %x - %b = sitofp <4 x i32> %a to <4 x double> - %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> - ret <2 x double> %c -} - -define <2 x double> @sitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) { -; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64_2: -; SSE: # %bb.0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2: -; AVX: # %bb.0: -; AVX-NEXT: vmovaps (%rdi), %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %a = load volatile <4 x i32>, <4 x i32>* %x - %b = sitofp <4 x i32> %a to <4 x double> - %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> - ret <2 x double> %c -} - -define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) { -; SSE2-LABEL: sitofp_load_2i16_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_2i16_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_load_2i16_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load <2 x i16>, <2 x i16> *%a - %cvt = sitofp <2 x i16> %ld to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) { -; SSE2-LABEL: sitofp_load_2i8_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_2i8_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movzwl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_load_2i8_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load <2 x i8>, <2 x i8> *%a - %cvt = sitofp <2 x i8> %ld to <2 x double> - ret <2 x double> %cvt -} - -define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { -; SSE2-LABEL: sitofp_load_4i64_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm2 -; SSE2-NEXT: 
movq %xmm1, %rax -; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2sd %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2sd %rax, %xmm2 -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_4i64_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cvtsi2sd %rax, %xmm2 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2sd %rax, %xmm0 -; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2sd %rax, %xmm2 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2sd %rax, %xmm1 -; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_load_4i64_to_4f64: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 -; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 -; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; VEX-NEXT: vmovq %xmm1, %rax -; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; VEX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: sitofp_load_4i64_to_4f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 -; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2pd 
(%rdi), %ymm0 -; AVX512VLDQ-NEXT: retq - %ld = load <4 x i64>, <4 x i64> *%a - %cvt = sitofp <4 x i64> %ld to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) { -; SSE-LABEL: sitofp_load_4i32_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_load_4i32_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0 -; AVX-NEXT: retq - %ld = load <4 x i32>, <4 x i32> *%a - %cvt = sitofp <4 x i32> %ld to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { -; SSE2-LABEL: sitofp_load_4i16_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_4i16_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd (%rdi), %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_load_4i16_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %ld = load <4 x i16>, <4 x i16> *%a - %cvt = sitofp <4 x i16> %ld to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { -; SSE2-LABEL: sitofp_load_4i8_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_4i8_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd (%rdi), %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_load_4i8_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %ld = load <4 x i8>, <4 x i8> *%a - %cvt = sitofp <4 x i8> %ld to <4 x double> - ret <4 x double> %cvt -} - -; -; Load Unsigned Integer to Double -; - -define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { -; SSE2-LABEL: uitofp_load_2i64_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: por {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_2i64_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: por {{.*}}(%rip), %xmm1 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: por {{.*}}(%rip), %xmm0 -; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addpd %xmm1, 
%xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_load_2i64_to_2f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_2i64_to_2f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_2i64_to_2f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] -; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_2i64_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 -; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 -; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 -; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %ld = load <2 x i64>, <2 x i64> *%a - %cvt = uitofp <2 x i64> %ld to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { -; SSE2-LABEL: uitofp_load_2i32_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_2i32_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_load_2i32_to_2f64: -; VEX: # %bb.0: -; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] -; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 -; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_2i32_to_2f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_2i32_to_2f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %ld = load <2 x i32>, <2 x i32> *%a - %cvt = uitofp <2 x i32> %ld to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) { -; SSE2-LABEL: uitofp_load_4i32_to_2f64_2: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_4i32_to_2f64_2: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_load_4i32_to_2f64_2: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 -; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %a = load <4 x i32>, <4 x i32>* %x - %b = uitofp <4 x i32> %a to <4 x double> - %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> - ret <2 x double> %c -} - -define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) { -; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2: -; 
SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 -; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; VEX-NEXT: vpsrld $16, %xmm0, %xmm0 -; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1 -; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0 -; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovaps (%rdi), %xmm0 -; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0 -; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %a = load volatile <4 x i32>, <4 x i32>* %x - %b = uitofp <4 x i32> %a to <4 x double> - %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> - ret <2 x double> %c -} - -define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { -; SSE2-LABEL: uitofp_load_2i16_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_2i16_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_load_2i16_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load <2 x i16>, <2 x i16> *%a - %cvt = uitofp <2 x i16> %ld to <2 x double> - ret <2 x double> %cvt -} - -define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) { -; SSE2-LABEL: uitofp_load_2i8_to_2f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; 
SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_2i8_to_2f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movzwl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_load_2i8_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load <2 x i8>, <2 x i8> *%a - %cvt = uitofp <2 x i8> %ld to <2 x double> - ret <2 x double> %cvt -} - -define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { -; SSE2-LABEL: uitofp_load_4i64_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: psrlq $32, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] -; SSE2-NEXT: subpd %xmm6, %xmm0 -; SSE2-NEXT: addpd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlq $32, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: subpd %xmm6, %xmm1 -; SSE2-NEXT: addpd %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_4i64_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200] -; SSE41-NEXT: por %xmm4, %xmm3 -; SSE41-NEXT: psrlq $32, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072] -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25] -; SSE41-NEXT: subpd %xmm6, %xmm0 -; SSE41-NEXT: addpd %xmm3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: psrlq $32, %xmm1 -; SSE41-NEXT: por %xmm5, %xmm1 -; SSE41-NEXT: subpd %xmm6, %xmm1 -; SSE41-NEXT: addpd %xmm2, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_load_4i64_to_4f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7] -; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vaddpd 
%ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_4i64_to_4f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_4i64_to_4f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] -; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] -; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 -; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 -; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 -; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 -; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %ymm0 -; AVX512VLDQ-NEXT: retq - %ld = load <4 x i64>, <4 x i64> *%a - %cvt = uitofp <4 x i64> %ld to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) { -; SSE2-LABEL: uitofp_load_4i32_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] -; SSE2-NEXT: mulpd %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE2-NEXT: addpd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5 -; SSE2-NEXT: mulpd %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1 -; SSE2-NEXT: addpd %xmm5, %xmm1 -; SSE2-NEXT: retq -; -; 
SSE41-LABEL: uitofp_load_4i32_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4] -; SSE41-NEXT: mulpd %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE41-NEXT: addpd %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5 -; SSE41-NEXT: mulpd %xmm2, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7] -; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1 -; SSE41-NEXT: addpd %xmm5, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_load_4i32_to_4f64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_4i32_to_4f64: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1 -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4] -; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_4i32_to_4f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_4i32_to_4f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2pd (%rdi), %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %ymm0 -; AVX512VLDQ-NEXT: retq - %ld = load <4 x i32>, <4 x i32> *%a - %cvt = uitofp <4 x i32> %ld to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { -; SSE2-LABEL: uitofp_load_4i16_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_4i16_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_load_4i16_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 
= mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %ld = load <4 x i16>, <4 x i16> *%a - %cvt = uitofp <4 x i16> %ld to <4 x double> - ret <4 x double> %cvt -} - -define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) { -; SSE2-LABEL: uitofp_load_4i8_to_4f64: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_4i8_to_4f64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_load_4i8_to_4f64: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0 -; AVX-NEXT: retq - %ld = load <4 x i8>, <4 x i8> *%a - %cvt = uitofp <4 x i8> %ld to <4 x double> - ret <4 x double> %cvt -} - -; -; Load Signed Integer to Float -; - -define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { -; SSE2-LABEL: sitofp_load_4i64_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_4i64_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_load_4i64_to_4f32: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 -; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; VEX-NEXT: vmovq %xmm1, 
%rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: sitofp_load_4i64_to_4f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 -; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %ld = load <4 x i64>, <4 x i64> *%a - %cvt = sitofp <4 x i64> %ld to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) { -; SSE-LABEL: sitofp_load_4i32_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_load_4i32_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0 -; AVX-NEXT: retq - %ld = load <4 x i32>, <4 x i32> *%a - %cvt = sitofp <4 x i32> %ld to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) { -; SSE2-LABEL: sitofp_load_4i16_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_4i16_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_load_4i16_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load <4 x i16>, <4 x i16> *%a - %cvt = sitofp <4 x i16> %ld to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) { -; SSE2-LABEL: sitofp_load_4i8_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_4i8_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sitofp_load_4i8_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load <4 x i8>, <4 x i8> *%a - %cvt = sitofp <4 x i8> %ld to <4 x float> - ret <4 x float> %cvt -} - -define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { -; SSE2-LABEL: sitofp_load_8i64_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm3 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: xorps %xmm4, %xmm4 -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_8i64_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: movdqa 32(%rdi), %xmm2 -; SSE41-NEXT: movdqa 48(%rdi), %xmm3 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm4 -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: xorps %xmm4, %xmm4 -; SSE41-NEXT: cvtsi2ss %rax, %xmm4 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3] -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; SSE41-NEXT: pextrq $1, %xmm2, %rax -; SSE41-NEXT: xorps %xmm4, %xmm4 -; SSE41-NEXT: cvtsi2ss %rax, %xmm4 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] -; SSE41-NEXT: movq %xmm3, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; SSE41-NEXT: pextrq $1, %xmm3, %rax -; 
SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; SSE41-NEXT: retq -; -; VEX-LABEL: sitofp_load_8i64_to_8f32: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm0 -; VEX-NEXT: vmovdqa 16(%rdi), %xmm1 -; VEX-NEXT: vmovdqa 32(%rdi), %xmm2 -; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 -; VEX-NEXT: vpextrq $1, %xmm2, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; VEX-NEXT: vmovq %xmm2, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] -; VEX-NEXT: vmovq %xmm3, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; VEX-NEXT: vpextrq $1, %xmm3, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] -; VEX-NEXT: vmovq %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; VEX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: sitofp_load_8i64_to_8f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vpextrq $1, %xmm2, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX512F-NEXT: vmovq %xmm3, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; AVX512F-NEXT: vpextrq $1, %xmm3, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 -; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX512VL-NEXT: vmovq %xmm3, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = 
xmm2[0,1,2],xmm3[0] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvtqq2ps (%rdi), %ymm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtqq2ps (%rdi), %ymm0 -; AVX512VLDQ-NEXT: retq - %ld = load <8 x i64>, <8 x i64> *%a - %cvt = sitofp <8 x i64> %ld to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) { -; SSE-LABEL: sitofp_load_8i32_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2ps (%rdi), %xmm0 -; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_load_8i32_to_8f32: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0 -; AVX-NEXT: retq - %ld = load <8 x i32>, <8 x i32> *%a - %cvt = sitofp <8 x i32> %ld to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { -; SSE2-LABEL: sitofp_load_8i16_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_8i16_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 -; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_load_8i16_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_load_8i16_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sitofp_load_8i16_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0 -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: retq - %ld = load <8 x i16>, <8 x i16> *%a - %cvt = sitofp <8 x i16> %ld to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) { -; SSE2-LABEL: sitofp_load_8i8_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: sitofp_load_8i8_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1 -; SSE41-NEXT: pmovsxbd 
(%rdi), %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sitofp_load_8i8_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sitofp_load_8i8_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sitofp_load_8i8_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0 -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: retq - %ld = load <8 x i8>, <8 x i8> *%a - %cvt = sitofp <8 x i8> %ld to <8 x float> - ret <8 x float> %cvt -} - -; -; Load Unsigned Integer to Float -; - -define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { -; SSE2-LABEL: uitofp_load_4i64_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm2 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: jmp .LBB81_3 -; SSE2-NEXT: .LBB81_1: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 -; SSE2-NEXT: .LBB81_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_4 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: jmp .LBB81_6 -; SSE2-NEXT: .LBB81_4: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: addss %xmm3, %xmm3 -; SSE2-NEXT: .LBB81_6: -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_7 -; SSE2-NEXT: # %bb.8: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB81_9 -; SSE2-NEXT: .LBB81_7: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB81_9: -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB81_10 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: jmp .LBB81_12 -; SSE2-NEXT: .LBB81_10: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: addss %xmm2, %xmm2 -; SSE2-NEXT: .LBB81_12: -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_4i64_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: jmp .LBB81_3 -; SSE41-NEXT: .LBB81_1: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: 
cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: addss %xmm2, %xmm2 -; SSE41-NEXT: .LBB81_3: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_4 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: jmp .LBB81_6 -; SSE41-NEXT: .LBB81_4: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: .LBB81_6: -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_7 -; SSE41-NEXT: # %bb.8: -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: jmp .LBB81_9 -; SSE41-NEXT: .LBB81_7: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: addss %xmm2, %xmm2 -; SSE41-NEXT: .LBB81_9: -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB81_10 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB81_10: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm1 -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_load_4i64_to_4f32: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm2 -; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 -; VEX-NEXT: vpextrq $1, %xmm2, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: jmp .LBB81_3 -; VEX-NEXT: .LBB81_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB81_3: -; VEX-NEXT: vmovq %xmm2, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: jmp .LBB81_6 -; VEX-NEXT: .LBB81_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB81_6: -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_7 -; VEX-NEXT: # %bb.8: -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: jmp .LBB81_9 -; VEX-NEXT: .LBB81_7: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 -; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB81_9: -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB81_10 -; VEX-NEXT: # %bb.11: -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; VEX-NEXT: retq -; VEX-NEXT: .LBB81_10: -; VEX-NEXT: movq 
%rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_4i64_to_4f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 -; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2psy (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %ld = load <4 x i64>, <4 x i64> *%a - %cvt = uitofp <4 x i64> %ld to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) { -; SSE2-LABEL: uitofp_load_4i32_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: por {{.*}}(%rip), %xmm1 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: por {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addps {{.*}}(%rip), %xmm0 -; SSE2-NEXT: addps %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_4i32_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; SSE41-NEXT: addps {{.*}}(%rip), %xmm0 -; SSE41-NEXT: addps %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_load_4i32_to_4f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] -; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0 
-; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_4i32_to_4f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] -; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_4i32_to_4f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %xmm0 -; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_4i32_to_4f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2ps (%rdi), %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 -; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %xmm0 -; AVX512VLDQ-NEXT: retq - %ld = load <4 x i32>, <4 x i32> *%a - %cvt = uitofp <4 x i32> %ld to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) { -; SSE2-LABEL: uitofp_load_4i16_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_4i16_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_load_4i16_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load <4 x i16>, <4 x i16> *%a - %cvt = uitofp <4 x i16> %ld to <4 x float> - ret <4 x float> %cvt -} - -define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) { -; SSE2-LABEL: uitofp_load_4i8_to_4f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_4i8_to_4f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: uitofp_load_4i8_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %ld = load <4 x i8>, <4 x i8> *%a - %cvt = uitofp <4 x i8> %ld to <4 x float> - ret <4 x float> %cvt -} - -define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { -; SSE2-LABEL: uitofp_load_8i64_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm5 -; SSE2-NEXT: movdqa 16(%rdi), %xmm0 -; SSE2-NEXT: movdqa 32(%rdi), %xmm2 -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB85_1 -; SSE2-NEXT: # %bb.2: -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: jmp .LBB85_3 -; SSE2-NEXT: .LBB85_1: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm3 -; SSE2-NEXT: addss %xmm3, %xmm3 -; SSE2-NEXT: .LBB85_3: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB85_4 -; SSE2-NEXT: # %bb.5: -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: jmp .LBB85_6 -; SSE2-NEXT: .LBB85_4: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm4 -; SSE2-NEXT: addss %xmm4, %xmm4 -; SSE2-NEXT: .LBB85_6: -; SSE2-NEXT: movq %xmm5, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB85_7 -; SSE2-NEXT: # %bb.8: -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: jmp .LBB85_9 -; SSE2-NEXT: .LBB85_7: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: addss %xmm0, %xmm0 -; SSE2-NEXT: .LBB85_9: -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] -; SSE2-NEXT: movq %xmm5, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB85_10 -; SSE2-NEXT: # %bb.11: -; SSE2-NEXT: cvtsi2ss %rax, %xmm6 -; SSE2-NEXT: jmp .LBB85_12 -; SSE2-NEXT: .LBB85_10: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm6 -; SSE2-NEXT: addss %xmm6, %xmm6 -; SSE2-NEXT: .LBB85_12: -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB85_13 -; SSE2-NEXT: # %bb.14: -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rax, %xmm5 -; SSE2-NEXT: jmp .LBB85_15 -; SSE2-NEXT: .LBB85_13: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm5, %xmm5 -; SSE2-NEXT: cvtsi2ss %rax, %xmm5 -; SSE2-NEXT: addss %xmm5, %xmm5 -; SSE2-NEXT: .LBB85_15: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm1, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB85_16 -; SSE2-NEXT: # %bb.17: -; SSE2-NEXT: cvtsi2ss %rax, %xmm7 -; SSE2-NEXT: jmp .LBB85_18 -; SSE2-NEXT: .LBB85_16: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: cvtsi2ss %rax, %xmm7 -; SSE2-NEXT: addss %xmm7, %xmm7 -; SSE2-NEXT: .LBB85_18: -; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB85_19 -; SSE2-NEXT: # %bb.20: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: 
jmp .LBB85_21 -; SSE2-NEXT: .LBB85_19: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: cvtsi2ss %rax, %xmm1 -; SSE2-NEXT: addss %xmm1, %xmm1 -; SSE2-NEXT: .LBB85_21: -; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: testq %rax, %rax -; SSE2-NEXT: js .LBB85_22 -; SSE2-NEXT: # %bb.23: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: jmp .LBB85_24 -; SSE2-NEXT: .LBB85_22: -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq %rcx -; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: cvtsi2ss %rax, %xmm2 -; SSE2-NEXT: addss %xmm2, %xmm2 -; SSE2-NEXT: .LBB85_24: -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_8i64_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm4 -; SSE41-NEXT: movdqa 32(%rdi), %xmm1 -; SSE41-NEXT: movdqa 48(%rdi), %xmm2 -; SSE41-NEXT: pextrq $1, %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB85_1 -; SSE41-NEXT: # %bb.2: -; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: jmp .LBB85_3 -; SSE41-NEXT: .LBB85_1: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: addss %xmm3, %xmm3 -; SSE41-NEXT: .LBB85_3: -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB85_4 -; SSE41-NEXT: # %bb.5: -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: jmp .LBB85_6 -; SSE41-NEXT: .LBB85_4: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: addss %xmm0, %xmm0 -; SSE41-NEXT: .LBB85_6: -; SSE41-NEXT: movq %xmm4, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB85_7 -; SSE41-NEXT: # %bb.8: -; SSE41-NEXT: cvtsi2ss %rax, %xmm5 -; SSE41-NEXT: jmp .LBB85_9 -; SSE41-NEXT: .LBB85_7: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm5 -; SSE41-NEXT: addss %xmm5, %xmm5 -; SSE41-NEXT: .LBB85_9: -; SSE41-NEXT: pextrq $1, %xmm4, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB85_10 -; SSE41-NEXT: # %bb.11: -; SSE41-NEXT: xorps %xmm4, %xmm4 -; SSE41-NEXT: cvtsi2ss %rax, %xmm4 -; SSE41-NEXT: jmp .LBB85_12 -; SSE41-NEXT: .LBB85_10: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm4, %xmm4 -; SSE41-NEXT: cvtsi2ss %rax, %xmm4 -; SSE41-NEXT: addss %xmm4, %xmm4 -; SSE41-NEXT: .LBB85_12: -; SSE41-NEXT: pextrq $1, %xmm1, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB85_13 -; SSE41-NEXT: # %bb.14: -; SSE41-NEXT: cvtsi2ss %rax, %xmm6 -; SSE41-NEXT: jmp .LBB85_15 -; SSE41-NEXT: .LBB85_13: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: cvtsi2ss %rax, %xmm6 -; SSE41-NEXT: addss %xmm6, %xmm6 -; SSE41-NEXT: .LBB85_15: -; SSE41-NEXT: insertps {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[2,3] -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB85_16 -; SSE41-NEXT: # %bb.17: -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: jmp .LBB85_18 -; SSE41-NEXT: .LBB85_16: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm1, %xmm1 -; SSE41-NEXT: cvtsi2ss %rax, %xmm1 -; SSE41-NEXT: addss %xmm1, %xmm1 -; SSE41-NEXT: .LBB85_18: -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3] -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB85_19 -; SSE41-NEXT: # %bb.20: -; SSE41-NEXT: xorps %xmm3, %xmm3 -; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: jmp .LBB85_21 -; SSE41-NEXT: .LBB85_19: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm3, %xmm3 -; SSE41-NEXT: cvtsi2ss %rax, %xmm3 -; SSE41-NEXT: addss %xmm3, %xmm3 -; SSE41-NEXT: .LBB85_21: -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] -; SSE41-NEXT: pextrq $1, %xmm2, %rax -; SSE41-NEXT: testq %rax, %rax -; SSE41-NEXT: js .LBB85_22 -; SSE41-NEXT: # %bb.23: -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; SSE41-NEXT: retq -; SSE41-NEXT: .LBB85_22: -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shrq %rcx -; SSE41-NEXT: andl $1, %eax -; SSE41-NEXT: orq %rcx, %rax -; SSE41-NEXT: xorps %xmm2, %xmm2 -; SSE41-NEXT: cvtsi2ss %rax, %xmm2 -; SSE41-NEXT: addss %xmm2, %xmm2 -; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] -; SSE41-NEXT: retq -; -; VEX-LABEL: uitofp_load_8i64_to_8f32: -; VEX: # %bb.0: -; VEX-NEXT: vmovdqa (%rdi), %xmm1 -; VEX-NEXT: vmovdqa 16(%rdi), %xmm0 -; VEX-NEXT: vmovdqa 32(%rdi), %xmm4 -; VEX-NEXT: vmovdqa 48(%rdi), %xmm3 -; VEX-NEXT: vpextrq $1, %xmm4, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB85_1 -; VEX-NEXT: # %bb.2: -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; VEX-NEXT: jmp .LBB85_3 -; VEX-NEXT: .LBB85_1: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 -; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB85_3: -; VEX-NEXT: vmovq %xmm4, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB85_4 -; VEX-NEXT: # %bb.5: -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 -; VEX-NEXT: jmp .LBB85_6 -; VEX-NEXT: .LBB85_4: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 -; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5 -; VEX-NEXT: .LBB85_6: -; VEX-NEXT: vmovq %xmm3, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB85_7 -; VEX-NEXT: # %bb.8: -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 -; VEX-NEXT: jmp .LBB85_9 -; VEX-NEXT: .LBB85_7: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 -; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4 -; VEX-NEXT: .LBB85_9: -; VEX-NEXT: vpextrq $1, %xmm3, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB85_10 -; VEX-NEXT: # %bb.11: -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 -; VEX-NEXT: jmp .LBB85_12 -; VEX-NEXT: .LBB85_10: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: 
shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 -; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3 -; VEX-NEXT: .LBB85_12: -; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB85_13 -; VEX-NEXT: # %bb.14: -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 -; VEX-NEXT: jmp .LBB85_15 -; VEX-NEXT: .LBB85_13: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6 -; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6 -; VEX-NEXT: .LBB85_15: -; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3] -; VEX-NEXT: vmovq %xmm1, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB85_16 -; VEX-NEXT: # %bb.17: -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 -; VEX-NEXT: jmp .LBB85_18 -; VEX-NEXT: .LBB85_16: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1 -; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1 -; VEX-NEXT: .LBB85_18: -; VEX-NEXT: vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3] -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3] -; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB85_19 -; VEX-NEXT: # %bb.20: -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 -; VEX-NEXT: jmp .LBB85_21 -; VEX-NEXT: .LBB85_19: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 -; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2 -; VEX-NEXT: .LBB85_21: -; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3] -; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0] -; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: testq %rax, %rax -; VEX-NEXT: js .LBB85_22 -; VEX-NEXT: # %bb.23: -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 -; VEX-NEXT: jmp .LBB85_24 -; VEX-NEXT: .LBB85_22: -; VEX-NEXT: movq %rax, %rcx -; VEX-NEXT: shrq %rcx -; VEX-NEXT: andl $1, %eax -; VEX-NEXT: orq %rcx, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0 -; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0 -; VEX-NEXT: .LBB85_24: -; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] -; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_8i64_to_8f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512F-NEXT: vpextrq $1, %xmm2, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 -; AVX512F-NEXT: vmovq %xmm2, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX512F-NEXT: vmovq %xmm3, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; AVX512F-NEXT: vpextrq $1, %xmm3, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] -; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 -; 
AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 -; AVX512VL-NEXT: vmovq %xmm2, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX512VL-NEXT: vmovq %xmm3, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] -; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] -; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] -; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3] -; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 -; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vcvtuqq2ps (%rdi), %ymm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtuqq2ps (%rdi), %ymm0 -; AVX512VLDQ-NEXT: retq - %ld = load <8 x i64>, <8 x i64> *%a - %cvt = uitofp <8 x i64> %ld to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) { -; SSE2-LABEL: uitofp_load_8i32_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa 16(%rdi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; SSE2-NEXT: addps %xmm6, %xmm0 -; SSE2-NEXT: addps %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: addps %xmm6, %xmm1 -; SSE2-NEXT: addps %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_8i32_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa (%rdi), %xmm0 -; SSE41-NEXT: movdqa 16(%rdi), %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] -; SSE41-NEXT: movaps {{.*#+}} xmm5 = 
[-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; SSE41-NEXT: addps %xmm5, %xmm0 -; SSE41-NEXT: addps %xmm3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; SSE41-NEXT: addps %xmm5, %xmm1 -; SSE41-NEXT: addps %xmm2, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_load_8i32_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1 -; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_8i32_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200] -; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] -; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: uitofp_load_8i32_to_8f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps (%rdi), %ymm0 -; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: uitofp_load_8i32_to_8f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2ps (%rdi), %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0 -; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %ymm0 -; AVX512VLDQ-NEXT: retq - %ld = load <8 x i32>, <8 x i32> *%a - %cvt = uitofp <8 x i32> %ld to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) { -; SSE2-LABEL: uitofp_load_8i16_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_8i16_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; 
SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_load_8i16_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_8i16_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: uitofp_load_8i16_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: retq - %ld = load <8 x i16>, <8 x i16> *%a - %cvt = uitofp <8 x i16> %ld to <8 x float> - ret <8 x float> %cvt -} - -define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) { -; SSE2-LABEL: uitofp_load_8i8_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE2-NEXT: retq -; -; SSE41-LABEL: uitofp_load_8i8_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: uitofp_load_8i8_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: uitofp_load_8i8_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: uitofp_load_8i8_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: retq - %ld = load <8 x i8>, <8 x i8> *%a - %cvt = uitofp <8 x i8> %ld to <8 x float> - ret <8 x float> %cvt -} - -; -; Aggregates -; - -%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }> 
-define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) { -; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq 24(%rdi), %rax -; SSE2-NEXT: movdqu 8(%rdi), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE2-NEXT: movaps %xmm0, 16(%rax) -; SSE2-NEXT: movaps %xmm1, (%rax) -; SSE2-NEXT: retq -; -; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32: -; SSE41: # %bb.0: -; SSE41-NEXT: movq 24(%rdi), %rax -; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0 -; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 -; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE41-NEXT: movaps %xmm0, 16(%rax) -; SSE41-NEXT: movaps %xmm1, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32: -; AVX1: # %bb.0: -; AVX1-NEXT: movq 24(%rdi), %rax -; AVX1-NEXT: vpmovsxwd 16(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX1-NEXT: vmovaps %ymm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32: -; AVX2: # %bb.0: -; AVX2-NEXT: movq 24(%rdi), %rax -; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0 -; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX2-NEXT: vmovaps %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32: -; AVX512: # %bb.0: -; AVX512-NEXT: movq 24(%rdi), %rax -; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0 -; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0 -; AVX512-NEXT: vmovaps %ymm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = load %Arguments, %Arguments* %a0, align 1 - %2 = extractvalue %Arguments %1, 1 - %3 = extractvalue %Arguments %1, 2 - %4 = sitofp <8 x i16> %2 to <8 x float> - store <8 x float> %4, <8 x float>* %3, align 32 - ret void -} - -define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind { -; SSE-LABEL: sitofp_i32_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: cvtsi2sd %edi, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_i32_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = sitofp i32 %a1 to double - %res = insertelement <2 x double> %a0, double %cvt, i32 0 - ret <2 x double> %res -} - -define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind { -; SSE-LABEL: sitofp_i32_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: cvtsi2ss %edi, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_i32_to_4f32: -; AVX: # %bb.0: -; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = sitofp i32 %a1 to float - %res = insertelement <4 x float> %a0, float %cvt, i32 0 - ret <4 x float> %res -} - -define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind { -; SSE-LABEL: sitofp_i64_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: cvtsi2sd %rdi, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_i64_to_2f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = sitofp i64 %a1 to double - %res = insertelement <2 x double> %a0, double %cvt, i32 0 - ret <2 x double> %res -} - -define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind { -; SSE-LABEL: sitofp_i64_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: cvtsi2ss %rdi, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: sitofp_i64_to_4f32: -; AVX: # %bb.0: -; 
AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 -; AVX-NEXT: retq - %cvt = sitofp i64 %a1 to float - %res = insertelement <4 x float> %a0, float %cvt, i32 0 - ret <4 x float> %res -} - -; Extract from int vector and convert to FP. - -define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind { -; SSE-LABEL: extract0_sitofp_v4i32_f32: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: extract0_sitofp_v4i32_f32: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %e = extractelement <4 x i32> %x, i32 0 - %r = sitofp i32 %e to float - ret float %r -} - -define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind { -; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: -; SSE: # %bb.0: -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: incl %eax -; SSE-NEXT: cvtsi2ss %eax, %xmm1 -; SSE-NEXT: divss %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1: -; AVX: # %bb.0: -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: incl %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 -; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq - %e = extractelement <4 x i32> %x, i32 0 - %f = sitofp i32 %e to float - %e1 = add i32 %e, 1 - %f1 = sitofp i32 %e1 to float - %r = fdiv float %f, %f1 - ret float %r -} - -define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind { -; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2: -; SSE: # %bb.0: -; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE-NEXT: movss %xmm0, (%rdi) -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1 -; AVX-NEXT: vmovss %xmm0, (%rdi) -; AVX-NEXT: vmovaps %xmm1, %xmm0 -; AVX-NEXT: retq - %e = extractelement <4 x i32> %x, i32 0 - %r = sitofp i32 %e to float - store i32 %e, i32* %p - ret float %r -} - -define double @extract0_sitofp_v4i32_f64(<4 x i32> %x) nounwind { -; SSE-LABEL: extract0_sitofp_v4i32_f64: -; SSE: # %bb.0: -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2sd %eax, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: extract0_sitofp_v4i32_f64: -; AVX: # %bb.0: -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %e = extractelement <4 x i32> %x, i32 0 - %r = sitofp i32 %e to double - ret double %r -} - -define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind { -; SSE-LABEL: extract0_uitofp_v4i32_f32: -; SSE: # %bb.0: -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ss %rax, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: extract0_uitofp_v4i32_f32: -; VEX: # %bb.0: -; VEX-NEXT: vmovd %xmm0, %eax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: extract0_uitofp_v4i32_f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: extract0_uitofp_v4i32_f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32: -; AVX512VLDQ: # %bb.0: 
-; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %e = extractelement <4 x i32> %x, i32 0 - %r = uitofp i32 %e to float - ret float %r -} - -define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind { -; SSE-LABEL: extract0_uitofp_v4i32_f64: -; SSE: # %bb.0: -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2sd %rax, %xmm0 -; SSE-NEXT: retq -; -; VEX-LABEL: extract0_uitofp_v4i32_f64: -; VEX: # %bb.0: -; VEX-NEXT: vmovd %xmm0, %eax -; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: extract0_uitofp_v4i32_f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: extract0_uitofp_v4i32_f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %e = extractelement <4 x i32> %x, i32 0 - %r = uitofp i32 %e to double - ret double %r -} - -; Extract non-zero element from int vector and convert to FP. - -define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind { -; SSE-LABEL: extract3_sitofp_v4i32_f32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: extract3_sitofp_v4i32_f32: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 -; AVX-NEXT: retq - %e = extractelement <4 x i32> %x, i32 3 - %r = sitofp i32 %e to float - ret float %r -} - -define double @extract3_sitofp_v4i32_f64(<4 x i32> %x) nounwind { -; SSE2-LABEL: extract3_sitofp_v4i32_f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sd %eax, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: extract3_sitofp_v4i32_f64: -; SSE41: # %bb.0: -; SSE41-NEXT: extractps $3, %xmm0, %eax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2sd %eax, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: extract3_sitofp_v4i32_f64: -; AVX: # %bb.0: -; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0 -; AVX-NEXT: retq - %e = extractelement <4 x i32> %x, i32 3 - %r = sitofp i32 %e to double - ret double %r -} - -define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind { -; SSE2-LABEL: extract3_uitofp_v4i32_f32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2ss %rax, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: extract3_uitofp_v4i32_f32: -; SSE41: # %bb.0: -; SSE41-NEXT: extractps $3, %xmm0, %eax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2ss %rax, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: extract3_uitofp_v4i32_f32: -; VEX: # %bb.0: -; VEX-NEXT: vextractps $3, %xmm0, %eax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: extract3_uitofp_v4i32_f32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} 
xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: extract3_uitofp_v4i32_f32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %e = extractelement <4 x i32> %x, i32 3 - %r = uitofp i32 %e to float - ret float %r -} - -define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind { -; SSE2-LABEL: extract3_uitofp_v4i32_f64: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorps %xmm0, %xmm0 -; SSE2-NEXT: cvtsi2sd %rax, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: extract3_uitofp_v4i32_f64: -; SSE41: # %bb.0: -; SSE41-NEXT: extractps $3, %xmm0, %eax -; SSE41-NEXT: xorps %xmm0, %xmm0 -; SSE41-NEXT: cvtsi2sd %rax, %xmm0 -; SSE41-NEXT: retq -; -; VEX-LABEL: extract3_uitofp_v4i32_f64: -; VEX: # %bb.0: -; VEX-NEXT: vextractps $3, %xmm0, %eax -; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 -; VEX-NEXT: retq -; -; AVX512F-LABEL: extract3_uitofp_v4i32_f64: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: extract3_uitofp_v4i32_f64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64: -; AVX512VLDQ: # %bb.0: -; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] -; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0 -; AVX512VLDQ-NEXT: retq - %e = extractelement <4 x i32> %x, i32 3 - %r = uitofp i32 %e to double - ret double %r -} - diff --git a/test/CodeGen/X86/vector-idiv-v2i32.ll b/test/CodeGen/X86/vector-idiv-v2i32.ll index a76c4d10afa..6f6776ea89e 100644 --- a/test/CodeGen/X86/vector-idiv-v2i32.ll +++ b/test/CodeGen/X86/vector-idiv-v2i32.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X64_WIDEN -; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X86_WIDEN define void @test_udiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X64-LABEL: test_udiv7_v2i32: 
@@ -43,45 +41,6 @@ define void @test_udiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: psrld $2, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl -; -; X64_WIDEN-LABEL: test_udiv7_v2i32: -; X64_WIDEN: # %bb.0: -; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 -; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64_WIDEN-NEXT: psubd %xmm2, %xmm0 -; X64_WIDEN-NEXT: psrld $1, %xmm0 -; X64_WIDEN-NEXT: paddd %xmm2, %xmm0 -; X64_WIDEN-NEXT: psrld $2, %xmm0 -; X64_WIDEN-NEXT: movq %xmm0, (%rsi) -; X64_WIDEN-NEXT: retq -; -; X86_WIDEN-LABEL: test_udiv7_v2i32: -; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 -; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 -; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] -; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86_WIDEN-NEXT: psubd %xmm2, %xmm0 -; X86_WIDEN-NEXT: psrld $1, %xmm0 -; X86_WIDEN-NEXT: paddd %xmm2, %xmm0 -; X86_WIDEN-NEXT: psrld $2, %xmm0 -; X86_WIDEN-NEXT: movq %xmm0, (%eax) -; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, store <2 x i32> %b, <2 x i32>* %y @@ -137,55 +96,6 @@ define void @test_urem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: paddd %xmm0, %xmm1 ; X86-NEXT: movq %xmm1, (%eax) ; X86-NEXT: retl -; -; X64_WIDEN-LABEL: test_urem7_v2i32: -; X64_WIDEN: # %bb.0: -; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 -; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X64_WIDEN-NEXT: movdqa %xmm0, %xmm1 -; X64_WIDEN-NEXT: psubd %xmm2, %xmm1 -; X64_WIDEN-NEXT: psrld $1, %xmm1 -; X64_WIDEN-NEXT: paddd %xmm2, %xmm1 -; X64_WIDEN-NEXT: psrld $2, %xmm1 -; X64_WIDEN-NEXT: movdqa %xmm1, %xmm2 -; X64_WIDEN-NEXT: pslld $3, %xmm2 -; X64_WIDEN-NEXT: psubd %xmm2, %xmm1 -; X64_WIDEN-NEXT: paddd %xmm0, %xmm1 -; X64_WIDEN-NEXT: movq %xmm1, (%rsi) -; X64_WIDEN-NEXT: retq -; -; X86_WIDEN-LABEL: test_urem7_v2i32: -; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 -; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 -; 
X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] -; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] -; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm1 -; X86_WIDEN-NEXT: psubd %xmm2, %xmm1 -; X86_WIDEN-NEXT: psrld $1, %xmm1 -; X86_WIDEN-NEXT: paddd %xmm2, %xmm1 -; X86_WIDEN-NEXT: psrld $2, %xmm1 -; X86_WIDEN-NEXT: movdqa %xmm1, %xmm2 -; X86_WIDEN-NEXT: pslld $3, %xmm2 -; X86_WIDEN-NEXT: psubd %xmm2, %xmm1 -; X86_WIDEN-NEXT: paddd %xmm0, %xmm1 -; X86_WIDEN-NEXT: movq %xmm1, (%eax) -; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = urem <2 x i32> %a, store <2 x i32> %b, <2 x i32>* %y @@ -243,57 +153,6 @@ define void @test_sdiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: paddd %xmm0, %xmm2 ; X86-NEXT: movq %xmm2, (%eax) ; X86-NEXT: retl -; -; X64_WIDEN-LABEL: test_sdiv7_v2i32: -; X64_WIDEN: # %bb.0: -; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 -; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64_WIDEN-NEXT: pxor %xmm3, %xmm3 -; X64_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 -; X64_WIDEN-NEXT: pand %xmm1, %xmm3 -; X64_WIDEN-NEXT: paddd %xmm0, %xmm3 -; X64_WIDEN-NEXT: psubd %xmm3, %xmm2 -; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 -; X64_WIDEN-NEXT: movdqa %xmm2, %xmm0 -; X64_WIDEN-NEXT: psrld $31, %xmm0 -; X64_WIDEN-NEXT: psrad $2, %xmm2 -; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 -; X64_WIDEN-NEXT: movq %xmm2, (%rsi) -; X64_WIDEN-NEXT: retq -; -; X86_WIDEN-LABEL: test_sdiv7_v2i32: -; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 -; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 -; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] -; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X86_WIDEN-NEXT: pxor %xmm3, %xmm3 -; X86_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 -; X86_WIDEN-NEXT: pand %xmm1, %xmm3 -; X86_WIDEN-NEXT: paddd %xmm0, %xmm3 -; X86_WIDEN-NEXT: psubd %xmm3, %xmm2 -; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 -; X86_WIDEN-NEXT: movdqa %xmm2, %xmm0 -; X86_WIDEN-NEXT: psrld $31, %xmm0 -; X86_WIDEN-NEXT: psrad $2, %xmm2 -; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 -; X86_WIDEN-NEXT: movq %xmm2, (%eax) -; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = sdiv <2 x i32> %a, store <2 x i32> %b, <2 x i32>* %y @@ -359,65 +218,6 @@ define void @test_srem7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: paddd %xmm0, %xmm2 ; X86-NEXT: movq %xmm2, (%eax) ; X86-NEXT: retl -; -; X64_WIDEN-LABEL: test_srem7_v2i32: -; X64_WIDEN: # %bb.0: -; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2 -; 
X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3 -; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X64_WIDEN-NEXT: pxor %xmm3, %xmm3 -; X64_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 -; X64_WIDEN-NEXT: pand %xmm1, %xmm3 -; X64_WIDEN-NEXT: paddd %xmm0, %xmm3 -; X64_WIDEN-NEXT: psubd %xmm3, %xmm2 -; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 -; X64_WIDEN-NEXT: movdqa %xmm2, %xmm1 -; X64_WIDEN-NEXT: psrld $31, %xmm1 -; X64_WIDEN-NEXT: psrad $2, %xmm2 -; X64_WIDEN-NEXT: paddd %xmm1, %xmm2 -; X64_WIDEN-NEXT: movdqa %xmm2, %xmm1 -; X64_WIDEN-NEXT: pslld $3, %xmm1 -; X64_WIDEN-NEXT: psubd %xmm1, %xmm2 -; X64_WIDEN-NEXT: paddd %xmm0, %xmm2 -; X64_WIDEN-NEXT: movq %xmm2, (%rsi) -; X64_WIDEN-NEXT: retq -; -; X86_WIDEN-LABEL: test_srem7_v2i32: -; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2 -; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2 -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3 -; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3] -; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3 -; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; X86_WIDEN-NEXT: pxor %xmm3, %xmm3 -; X86_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3 -; X86_WIDEN-NEXT: pand %xmm1, %xmm3 -; X86_WIDEN-NEXT: paddd %xmm0, %xmm3 -; X86_WIDEN-NEXT: psubd %xmm3, %xmm2 -; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 -; X86_WIDEN-NEXT: movdqa %xmm2, %xmm1 -; X86_WIDEN-NEXT: psrld $31, %xmm1 -; X86_WIDEN-NEXT: psrad $2, %xmm2 -; X86_WIDEN-NEXT: paddd %xmm1, %xmm2 -; X86_WIDEN-NEXT: movdqa %xmm2, %xmm1 -; X86_WIDEN-NEXT: pslld $3, %xmm1 -; X86_WIDEN-NEXT: psubd %xmm1, %xmm2 -; X86_WIDEN-NEXT: paddd %xmm0, %xmm2 -; X86_WIDEN-NEXT: movq %xmm2, (%eax) -; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = srem <2 x i32> %a, store <2 x i32> %b, <2 x i32>* %y @@ -440,22 +240,6 @@ define void @test_udiv_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind { ; X86-NEXT: psrld $3, %xmm0 ; X86-NEXT: movq %xmm0, (%eax) ; X86-NEXT: retl -; -; X64_WIDEN-LABEL: test_udiv_pow2_v2i32: -; X64_WIDEN: # %bb.0: -; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X64_WIDEN-NEXT: psrld $3, %xmm0 -; X64_WIDEN-NEXT: movq %xmm0, (%rsi) -; X64_WIDEN-NEXT: retq -; -; X86_WIDEN-LABEL: test_udiv_pow2_v2i32: -; X86_WIDEN: # %bb.0: -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X86_WIDEN-NEXT: psrld $3, %xmm0 -; X86_WIDEN-NEXT: movq %xmm0, (%eax) -; X86_WIDEN-NEXT: retl %a = load <2 x i32>, <2 x i32>* %x %b = udiv <2 x i32> %a, store <2 x i32> %b, <2 x i32>* %y diff --git a/test/CodeGen/X86/vec_clz.ll b/test/CodeGen/X86/vector-lzcnt-sub128.ll similarity index 96% rename from test/CodeGen/X86/vec_clz.ll rename to test/CodeGen/X86/vector-lzcnt-sub128.ll index 6e64641f782..c1e7e42ac7e 100644 --- a/test/CodeGen/X86/vec_clz.ll +++ b/test/CodeGen/X86/vector-lzcnt-sub128.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown 
-x86-experimental-vector-widening-legalization | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1 immarg) diff --git a/test/CodeGen/X86/vector-reduce-add-widen.ll b/test/CodeGen/X86/vector-reduce-add-widen.ll deleted file mode 100644 index 0841acfc41a..00000000000 --- a/test/CodeGen/X86/vector-reduce-add-widen.ll +++ /dev/null @@ -1,1386 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE-LABEL: test_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE-LABEL: test_v4i64: -; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 
@llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE-LABEL: test_v8i64: -; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm2, %xmm1 -; SSE-NEXT: paddq %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: paddq %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE-LABEL: test_v16i64: -; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm5, %xmm3 -; SSE-NEXT: paddq %xmm1, %xmm3 -; SSE-NEXT: paddq %xmm4, %xmm2 -; SSE-NEXT: paddq %xmm3, %xmm2 -; SSE-NEXT: paddq %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: 
vpaddq %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE-LABEL: test_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v2i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v2i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE-LABEL: test_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v4i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v4i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE-LABEL: test_v8i32: -; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v8i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; 
AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v8i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE-LABEL: test_v16i32: -; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: paddd %xmm2, %xmm1 -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: paddd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v16i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v16i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2 -; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; 
AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE-LABEL: test_v32i32: -; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm6, %xmm2 -; SSE-NEXT: paddd %xmm7, %xmm3 -; SSE-NEXT: paddd %xmm5, %xmm3 -; SSE-NEXT: paddd %xmm1, %xmm3 -; SSE-NEXT: paddd %xmm4, %xmm2 -; SSE-NEXT: paddd %xmm3, %xmm2 -; SSE-NEXT: paddd %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: paddd %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v32i32: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v32i32: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; 
AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - -define i16 @test_v2i16(<2 x i16> %a0) { -; SSE-LABEL: test_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v2i16: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v2i16: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE-LABEL: test_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v4i16: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v4i16: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE-LABEL: test_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: paddw 
%xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v8i16: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v8i16: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE-LABEL: test_v16i16: -; SSE: # %bb.0: -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v16i16: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v16i16: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, 
%ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE-LABEL: test_v32i16: -; SSE: # %bb.0: -; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: paddw %xmm2, %xmm1 -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v32i16: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v32i16: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2 -; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; 
AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE-LABEL: test_v64i16: -; SSE: # %bb.0: -; SSE-NEXT: paddw %xmm6, %xmm2 -; SSE-NEXT: paddw %xmm7, %xmm3 -; SSE-NEXT: paddw %xmm5, %xmm3 -; SSE-NEXT: paddw %xmm1, %xmm3 -; SSE-NEXT: paddw %xmm4, %xmm2 -; SSE-NEXT: paddw %xmm3, %xmm2 -; SSE-NEXT: paddw %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: paddw %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: paddw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-SLOW-LABEL: test_v64i16: -; AVX1-SLOW: # %bb.0: -; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-SLOW-NEXT: vmovd %xmm0, %eax -; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-SLOW-NEXT: vzeroupper -; AVX1-SLOW-NEXT: retq -; -; AVX1-FAST-LABEL: test_v64i16: -; AVX1-FAST: # %bb.0: -; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1 -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0 -; AVX1-FAST-NEXT: vmovd %xmm0, %eax -; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-FAST-NEXT: vzeroupper -; AVX1-FAST-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: 
-; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; 
AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al 
killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: psadbw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: psadbw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: paddb %xmm3, %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddb %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: paddb %xmm3, %xmm1 -; SSE41-NEXT: paddb %xmm2, %xmm1 -; SSE41-NEXT: paddb %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: paddb %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: paddb %xmm7, %xmm3 -; SSE2-NEXT: paddb %xmm5, %xmm3 -; SSE2-NEXT: paddb %xmm1, %xmm3 -; SSE2-NEXT: paddb %xmm6, %xmm2 -; SSE2-NEXT: paddb %xmm4, %xmm2 -; SSE2-NEXT: paddb %xmm3, %xmm2 -; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: psadbw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: paddb %xmm7, %xmm3 -; SSE41-NEXT: paddb %xmm5, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm3 -; SSE41-NEXT: paddb %xmm6, %xmm2 -; SSE41-NEXT: paddb %xmm4, %xmm2 -; SSE41-NEXT: paddb %xmm3, %xmm2 -; SSE41-NEXT: paddb %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: paddb %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psadbw %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] -; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-reduce-and-widen.ll b/test/CodeGen/X86/vector-reduce-and-widen.ll deleted file mode 100644 index d169f0cb138..00000000000 --- a/test/CodeGen/X86/vector-reduce-and-widen.ll +++ /dev/null @@ -1,1168 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s 
-x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE-LABEL: test_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE-LABEL: test_v4i64: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE-LABEL: test_v8i64: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 
= call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE-LABEL: test_v16i64: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE-LABEL: test_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE-LABEL: test_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE-LABEL: test_v8i32: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE-LABEL: test_v16i32: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE-LABEL: test_v32i32: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v32i32: -; 
AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - -define i16 @test_v2i16(<2 x i16> %a0) { -; SSE-LABEL: test_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE-LABEL: test_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE-LABEL: test_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE-LABEL: test_v16i16: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE-LABEL: test_v32i16: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v32i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, 
%xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE-LABEL: test_v64i16: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v64i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; 
AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; 
AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; 
AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm3, %xmm1 -; SSE41-NEXT: pand %xmm2, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; 
AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pand %xmm6, %xmm2 -; SSE41-NEXT: pand %xmm7, %xmm3 -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: pand %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pand %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpand 
%xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-reduce-mul-widen.ll b/test/CodeGen/X86/vector-reduce-mul-widen.ll deleted file mode 100644 index 8242160f8ff..00000000000 --- a/test/CodeGen/X86/vector-reduce-mul-widen.ll +++ /dev/null @@ -1,3022 +0,0 @@ -; 
NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE-LABEL: test_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512BW-LABEL: test_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v2i64: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovq %xmm0, %rax -; AVX512BWVL-NEXT: retq -; -; AVX512DQ-LABEL: test_v2i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v2i64: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovq %xmm0, %rax -; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE-LABEL: test_v4i64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = 
xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v4i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v4i64: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovq %xmm0, %rax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512DQ-LABEL: test_v4i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v4i64: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovq %xmm0, %rax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE-LABEL: test_v8i64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; 
SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1 -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; 
AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v8i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v8i64: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovq %xmm0, %rax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; 
AVX512DQ-LABEL: test_v8i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v8i64: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovq %xmm0, %rax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE-LABEL: test_v16i64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm2, %xmm9 -; SSE-NEXT: paddq %xmm8, %xmm9 -; SSE-NEXT: psllq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm0, %xmm6 -; SSE-NEXT: paddq %xmm8, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm3, %xmm6 -; SSE-NEXT: paddq %xmm4, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm1, %xmm6 -; SSE-NEXT: paddq %xmm4, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrldq {{.*#+}} xmm3 = 
xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6 -; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7 -; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5 
-; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v16i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; 
AVX512BWVL-LABEL: test_v16i64: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovq %xmm0, %rax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512DQ-LABEL: test_v16i64: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vmovq %xmm0, %rax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v16i64: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovq %xmm0, %rax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE2-LABEL: test_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i32: -; AVX512: # 
%bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE2-LABEL: test_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm3 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,0,0] -; SSE2-NEXT: pmuludq %xmm3, %xmm0 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, 
%xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE2-LABEL: test_v16i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm5, %xmm2 -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,0,0] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmulld %xmm3, %xmm1 -; SSE41-NEXT: pmulld %xmm2, %xmm1 -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE2-LABEL: test_v32i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm10 -; SSE2-NEXT: pmuludq %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3] -; SSE2-NEXT: 
pmuludq %xmm8, %xmm11 -; SSE2-NEXT: pmuludq %xmm9, %xmm11 -; SSE2-NEXT: pmuludq %xmm10, %xmm11 -; SSE2-NEXT: pmuludq %xmm6, %xmm2 -; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pmuludq %xmm7, %xmm3 -; SSE2-NEXT: pmuludq %xmm5, %xmm1 -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,0,0] -; SSE2-NEXT: pmuludq %xmm11, %xmm1 -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmulld %xmm6, %xmm2 -; SSE41-NEXT: pmulld %xmm7, %xmm3 -; SSE41-NEXT: pmulld %xmm5, %xmm3 -; SSE41-NEXT: pmulld %xmm1, %xmm3 -; SSE41-NEXT: pmulld %xmm4, %xmm2 -; SSE41-NEXT: pmulld %xmm3, %xmm2 -; SSE41-NEXT: pmulld %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmulld %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmulld %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - -define i16 @test_v2i16(<2 x i16> %a0) { -; SSE-LABEL: test_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: 
def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE-LABEL: test_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE-LABEL: test_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE-LABEL: test_v16i16: -; SSE: # %bb.0: -; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = 
xmm0[2,3,0,1] -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE-LABEL: test_v32i16: -; SSE: # %bb.0: -; SSE-NEXT: pmullw %xmm3, %xmm1 -; SSE-NEXT: pmullw %xmm2, %xmm1 -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v32i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: 
vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v32i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovd %xmm0, %eax -; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512DQ-LABEL: test_v32i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v32i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovd %xmm0, %eax -; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE-LABEL: test_v64i16: -; SSE: # %bb.0: -; SSE-NEXT: pmullw %xmm6, %xmm2 -; SSE-NEXT: pmullw %xmm7, %xmm3 -; SSE-NEXT: pmullw %xmm5, %xmm3 -; SSE-NEXT: pmullw %xmm1, %xmm3 -; SSE-NEXT: pmullw %xmm4, %xmm2 -; SSE-NEXT: pmullw %xmm3, %xmm2 -; SSE-NEXT: pmullw %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pmullw %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pmullw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v64i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpmullw %xmm1, 
%xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v64i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v64i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovd %xmm0, %eax -; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512DQ-LABEL: test_v64i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovd %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v64i16: -; AVX512DQVL: 
# %bb.0: -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1 -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovd %xmm0, %eax -; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0] -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[10],zero,xmm1[14],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0 -; AVX-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero -; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512BW-LABEL: test_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmullw %xmm0, %xmm1, %xmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmullw %xmm0, %xmm1, %xmm0 -; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3] -; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero -; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: retq -; -; AVX512DQ-LABEL: test_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmullw %xmm0, %xmm1, %xmm0 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQ-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} 
xmm2 = xmm2[1,2,3,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm1, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmullw %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmullw %xmm3, %xmm0 -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmullw %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmullw %xmm0, 
%xmm3, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,xmm0[4],zero,xmm0[6],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512DQ-LABEL: test_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; 
AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v16i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm2, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm3, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm2, %xmm3 -; SSE41-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: pmullw %xmm1, %xmm3 -; SSE41-NEXT: pmullw %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pmullw %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: packuswb %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; SSE41-NEXT: pmullw %xmm1, %xmm3 -; SSE41-NEXT: pand %xmm2, %xmm3 -; SSE41-NEXT: packuswb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pmullw %xmm3, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, 
%xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v32i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512DQ-LABEL: test_v32i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v32i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, 
%xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm3, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: pmullw %xmm4, %xmm0 -; SSE2-NEXT: pmullw %xmm5, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pmullw %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: packuswb %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw 
{{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: pmullw %xmm3, %xmm1 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm3, %xmm6 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm3, %xmm6 -; SSE41-NEXT: pmullw %xmm4, %xmm5 -; SSE41-NEXT: pshufb %xmm3, %xmm5 -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; SSE41-NEXT: pmullw %xmm4, %xmm5 -; SSE41-NEXT: pshufb %xmm3, %xmm5 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; SSE41-NEXT: pmullw %xmm2, %xmm3 -; SSE41-NEXT: pmullw %xmm0, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] -; SSE41-NEXT: pmullw %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: packuswb %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] -; SSE41-NEXT: pmullw %xmm2, %xmm3 -; SSE41-NEXT: pand %xmm1, %xmm3 -; SSE41-NEXT: packuswb %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pmullw %xmm3, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; 
AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: 
vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v64i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsrlw $8, 
%xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; AVX512DQ-LABEL: test_v64i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; 
AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v64i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax -; 
AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm4, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm9, %xmm10 -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm8, %xmm11 -; SSE2-NEXT: movdqa %xmm5, %xmm12 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm11, %xmm12 -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: pmullw %xmm12, %xmm8 -; SSE2-NEXT: pmullw %xmm10, %xmm8 -; SSE2-NEXT: pmullw %xmm9, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm6, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm2, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm7, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm3, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm5, %xmm1 -; SSE2-NEXT: pmullw %xmm4, %xmm1 -; SSE2-NEXT: pmullw %xmm8, %xmm1 -; SSE2-NEXT: pmullw %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: pmullw %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: pmullw %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE41-NEXT: pmullw %xmm6, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE41-NEXT: pmullw %xmm2, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE41-NEXT: pmullw %xmm7, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = 
xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE41-NEXT: pmullw %xmm3, %xmm5 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE41-NEXT: pmullw %xmm5, %xmm1 -; SSE41-NEXT: pmullw %xmm4, %xmm1 -; SSE41-NEXT: pmullw %xmm7, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm5, %xmm3 -; SSE41-NEXT: pmullw %xmm11, %xmm6 -; SSE41-NEXT: pshufb %xmm5, %xmm6 -; SSE41-NEXT: pmullw %xmm10, %xmm2 -; SSE41-NEXT: pshufb %xmm5, %xmm2 -; SSE41-NEXT: pmullw %xmm8, %xmm9 -; SSE41-NEXT: pshufb %xmm5, %xmm9 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: pmullw %xmm7, %xmm2 -; SSE41-NEXT: pshufb %xmm5, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: pmullw %xmm6, %xmm3 -; SSE41-NEXT: pshufb %xmm5, %xmm3 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: pmullw %xmm3, %xmm2 -; SSE41-NEXT: pshufb %xmm5, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: pmullw %xmm0, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE41-NEXT: pmullw %xmm1, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # 
%bb.0: -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm8 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm7, %xmm5, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm10, %xmm5, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; AVX1-NEXT: vpmullw %xmm10, %xmm6, %xmm6 -; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6 -; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero -; AVX1-NEXT: vpmullw %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm4 -; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw 
%xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = 
ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v128i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = 
zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm1 -; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = 
xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: test_v128i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm4 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm1 -; AVX512BWVL-NEXT: vpackuswb %zmm4, %zmm1, %zmm1 -; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = 
zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63] -; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55] -; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; 
AVX512DQ-LABEL: test_v128i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 -; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; 
AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512DQVL-LABEL: test_v128i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQVL-NEXT: vpackuswb %ymm0, %ymm1, %ymm0 -; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>) -declare i32 
@llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-reduce-or-widen.ll b/test/CodeGen/X86/vector-reduce-or-widen.ll deleted file mode 100644 index 720b47371e3..00000000000 --- a/test/CodeGen/X86/vector-reduce-or-widen.ll +++ /dev/null @@ -1,1168 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE-LABEL: test_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE-LABEL: test_v4i64: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE-LABEL: test_v8i64: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE-LABEL: test_v16i64: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE-LABEL: test_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE-LABEL: test_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE-LABEL: test_v8i32: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE-LABEL: test_v16i32: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE-LABEL: test_v32i32: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v32i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - 
-define i16 @test_v2i16(<2 x i16> %a0) { -; SSE-LABEL: test_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE-LABEL: test_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE-LABEL: test_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE-LABEL: test_v16i16: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; 
AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE-LABEL: test_v32i16: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v32i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE-LABEL: test_v64i16: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: 
por %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v64i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: 
por %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: 
vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; 
SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm3, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 
= xmm2[2,3,0,1] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: por %xmm6, %xmm2 -; SSE41-NEXT: por %xmm7, %xmm3 -; SSE41-NEXT: por %xmm5, %xmm3 -; SSE41-NEXT: por %xmm1, %xmm3 -; SSE41-NEXT: por %xmm4, %xmm2 -; SSE41-NEXT: por %xmm3, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: por %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>) 
-declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-reduce-smax-widen.ll b/test/CodeGen/X86/vector-reduce-smax-widen.ll deleted file mode 100644 index f3cbdfccc61..00000000000 --- a/test/CodeGen/X86/vector-reduce-smax-widen.ll +++ /dev/null @@ -1,2001 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE2-LABEL: test_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, 
%xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512BW-LABEL: test_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE2-LABEL: test_v4i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; 
SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v4i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE2-LABEL: test_v8i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: xorpd %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, 
%xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v8i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE2-LABEL: test_v16i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm9 -; SSE2-NEXT: por %xmm1, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; 
SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm9 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm10 -; SSE41-NEXT: pxor %xmm9, %xmm10 -; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm10, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd 
{{.*#+}} xmm1 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm7, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm6, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] -; SSE41-NEXT: movdqa %xmm7, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm11, %xmm5, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm10 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm7, %xmm3 -; AVX1-NEXT: vblendvpd %xmm8, %xmm5, %xmm11, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: 
vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v16i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE2-LABEL: test_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxsd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmaxsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmaxsd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: 
vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE2-LABEL: test_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmaxsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmaxsd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE2-LABEL: test_v16i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] 
-; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsd %xmm3, %xmm1 -; SSE41-NEXT: pmaxsd %xmm2, %xmm1 -; SSE41-NEXT: pmaxsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmaxsd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxsd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE2-LABEL: test_v32i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm8 -; SSE2-NEXT: por %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand 
%xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsd %xmm6, %xmm2 -; SSE41-NEXT: pmaxsd %xmm7, %xmm3 -; SSE41-NEXT: pmaxsd %xmm5, %xmm3 -; SSE41-NEXT: pmaxsd %xmm1, %xmm3 -; SSE41-NEXT: pmaxsd %xmm4, %xmm2 -; SSE41-NEXT: pmaxsd %xmm3, %xmm2 -; SSE41-NEXT: pmaxsd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmaxsd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxsd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - -define i16 @test_v2i16(<2 x i16> %a0) { -; SSE-LABEL: test_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pmaxsw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; 
AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE-LABEL: test_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pmaxsw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pmaxsw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE2-LABEL: test_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE2-LABEL: test_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsw %xmm1, %xmm0 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; 
SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE2-LABEL: test_v32i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; SSE2-NEXT: pmaxsw %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsw %xmm3, %xmm1 -; SSE41-NEXT: pmaxsw %xmm2, %xmm1 -; SSE41-NEXT: pmaxsw %xmm0, %xmm1 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, 
%ymm0, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE2-LABEL: test_v64i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pmaxsw %xmm6, %xmm2 -; SSE2-NEXT: pmaxsw %xmm7, %xmm3 -; SSE2-NEXT: pmaxsw %xmm5, %xmm3 -; SSE2-NEXT: pmaxsw %xmm1, %xmm3 -; SSE2-NEXT: pmaxsw %xmm4, %xmm2 -; SSE2-NEXT: pmaxsw %xmm3, %xmm2 -; SSE2-NEXT: pmaxsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsw %xmm7, %xmm3 -; SSE41-NEXT: pmaxsw %xmm5, %xmm3 -; SSE41-NEXT: pmaxsw %xmm1, %xmm3 -; SSE41-NEXT: pmaxsw %xmm6, %xmm2 -; SSE41-NEXT: pmaxsw %xmm4, %xmm2 -; SSE41-NEXT: pmaxsw %xmm3, %xmm2 -; SSE41-NEXT: pmaxsw %xmm0, %xmm2 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE41-NEXT: phminposuw %xmm2, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxsw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 
@llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmaxsb %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmaxsb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pmaxsb %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; 
SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxsb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmaxsb %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmaxsb %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: xorb $127, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: xorb $127, %al -; AVX-NEXT: # kill: def $al killed $al killed $eax -; 
AVX-NEXT: retq -; -; AVX512-LABEL: test_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: xorb $127, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsb %xmm1, %xmm0 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: xorb $127, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: xorb $127, %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: xorb $127, %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: xorb $127, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = 
call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsb %xmm3, %xmm1 -; SSE41-NEXT: pmaxsb %xmm2, %xmm1 -; SSE41-NEXT: pmaxsb %xmm0, %xmm1 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm1, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: xorb $127, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: xorb $127, %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: xorb $127, %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: xorb $127, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: pcmpgtb %xmm6, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm8 -; SSE2-NEXT: por %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxsb %xmm7, %xmm3 -; SSE41-NEXT: pmaxsb %xmm5, %xmm3 -; SSE41-NEXT: pmaxsb %xmm1, %xmm3 -; SSE41-NEXT: pmaxsb %xmm6, %xmm2 -; SSE41-NEXT: pmaxsb %xmm4, %xmm2 -; SSE41-NEXT: pmaxsb %xmm3, %xmm2 -; SSE41-NEXT: pmaxsb %xmm0, %xmm2 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm2, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: xorb $127, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, 
%xmm4 -; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxsb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: xorb $127, %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxsb %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: xorb $127, %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: xorb $127, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-reduce-smin-widen.ll b/test/CodeGen/X86/vector-reduce-smin-widen.ll deleted file mode 100644 index f1d77f3b99e..00000000000 --- a/test/CodeGen/X86/vector-reduce-smin-widen.ll +++ /dev/null @@ -1,1999 
+0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE2-LABEL: test_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512BW-LABEL: test_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE2-LABEL: test_v4i64: -; SSE2: # %bb.0: -; SSE2-NEXT: 
movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v4i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, 
%xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE2-LABEL: test_v8i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; 
SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: xorpd %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v8i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE2-LABEL: test_v16i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE2-NEXT: 
pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm10 -; SSE41-NEXT: pxor %xmm9, %xmm10 -; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm10, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; SSE41-NEXT: movapd %xmm4, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm6, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por 
%xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm9, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm11, %xmm7, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vblendvpd %xmm10, %xmm11, %xmm7, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v16i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE2-LABEL: test_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; 
SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminsd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pminsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pminsd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE2-LABEL: test_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pminsd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pminsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pminsd %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: 
vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE2-LABEL: test_v16i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pminsd %xmm3, %xmm1 -; SSE41-NEXT: pminsd %xmm2, %xmm1 -; SSE41-NEXT: pminsd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pminsd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminsd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; 
AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE2-LABEL: test_v32i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm8 -; SSE2-NEXT: por %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm8 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pminsd %xmm6, %xmm2 -; SSE41-NEXT: pminsd %xmm7, %xmm3 -; SSE41-NEXT: pminsd %xmm5, %xmm3 -; SSE41-NEXT: pminsd %xmm1, %xmm3 -; SSE41-NEXT: pminsd %xmm4, %xmm2 -; SSE41-NEXT: pminsd %xmm3, %xmm2 -; SSE41-NEXT: pminsd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pminsd %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminsd %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpminsd %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpminsd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsd %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - -define i16 @test_v2i16(<2 x i16> %a0) { -; SSE-LABEL: test_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pminsw %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE-LABEL: test_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pminsw %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pminsw %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE2-LABEL: test_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: 
pminsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE2-LABEL: test_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pminsw %xmm1, %xmm0 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE2-LABEL: test_v32i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pminsw %xmm3, %xmm1 -; SSE2-NEXT: pminsw %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: 
pminsw %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pminsw %xmm3, %xmm1 -; SSE41-NEXT: pminsw %xmm2, %xmm1 -; SSE41-NEXT: pminsw %xmm0, %xmm1 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE2-LABEL: test_v64i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pminsw %xmm6, %xmm2 -; SSE2-NEXT: pminsw %xmm7, %xmm3 -; SSE2-NEXT: pminsw %xmm5, %xmm3 -; SSE2-NEXT: pminsw %xmm1, %xmm3 -; SSE2-NEXT: pminsw %xmm4, %xmm2 -; SSE2-NEXT: pminsw %xmm3, %xmm2 -; SSE2-NEXT: pminsw %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pminsw %xmm7, %xmm3 -; SSE41-NEXT: pminsw %xmm5, %xmm3 -; SSE41-NEXT: pminsw %xmm1, %xmm3 -; SSE41-NEXT: pminsw %xmm6, %xmm2 -; SSE41-NEXT: pminsw %xmm4, %xmm2 -; SSE41-NEXT: pminsw %xmm3, %xmm2 -; SSE41-NEXT: pminsw %xmm0, %xmm2 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE41-NEXT: phminposuw %xmm2, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i16: -; AVX1: # %bb.0: -; 
AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminsw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminsw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000 -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminsb %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: 
movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pminsb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminsb %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminsb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pminsb %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminsb %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 
@llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: xorb $-128, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: xorb $-128, %al -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: xorb $-128, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: # kill: def $al killed $al killed 
$eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pminsb %xmm1, %xmm0 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: xorb $-128, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: xorb $-128, %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: xorb $-128, %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: xorb $-128, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; 
SSE41: # %bb.0: -; SSE41-NEXT: pminsb %xmm3, %xmm1 -; SSE41-NEXT: pminsb %xmm2, %xmm1 -; SSE41-NEXT: pminsb %xmm0, %xmm1 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm1, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: xorb $-128, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: xorb $-128, %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: xorb $-128, %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: xorb $-128, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm8 -; SSE2-NEXT: por %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtb %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm8 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; 
SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pminsb %xmm7, %xmm3 -; SSE41-NEXT: pminsb %xmm5, %xmm3 -; SSE41-NEXT: pminsb %xmm1, %xmm3 -; SSE41-NEXT: pminsb %xmm6, %xmm2 -; SSE41-NEXT: pminsb %xmm4, %xmm2 -; SSE41-NEXT: pminsb %xmm3, %xmm2 -; SSE41-NEXT: pminsb %xmm0, %xmm2 -; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm2, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: xorb $-128, %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminsb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: xorb $-128, %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminsb %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: xorb $-128, %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: xorb $-128, %al -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; 
AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-reduce-umax-widen.ll b/test/CodeGen/X86/vector-reduce-umax-widen.ll deleted file mode 100644 index 371d85a7def..00000000000 --- a/test/CodeGen/X86/vector-reduce-umax-widen.ll +++ /dev/null @@ -1,2203 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE2-LABEL: test_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = 
xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512BW-LABEL: test_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE2-LABEL: test_v4i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: 
pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: pxor %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v4i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE2-LABEL: test_v8i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm6 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm0, %xmm7 -; SSE41-NEXT: pxor %xmm5, %xmm7 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm5, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm0 -; SSE41-NEXT: xorpd %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpxor %xmm2, %xmm6, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm4, %xmm4 -; AVX1-NEXT: vxorpd %xmm2, %xmm4, %xmm5 -; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v8i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE2-LABEL: test_v16i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE2-NEXT: 
pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm9 -; SSE2-NEXT: por %xmm1, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm9 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; 
SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm10 -; SSE41-NEXT: pxor %xmm9, %xmm10 -; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm10, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm8, %xmm3 -; SSE41-NEXT: pxor %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; SSE41-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm7, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm5, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; SSE41-NEXT: movapd %xmm7, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm6, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; 
SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] -; SSE41-NEXT: movdqa %xmm7, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm4, %xmm8, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10 -; AVX1-NEXT: vpxor %xmm4, %xmm10, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 -; AVX1-NEXT: vpxor %xmm4, %xmm11, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm12 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm13 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX1-NEXT: vblendvpd %xmm13, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm12, %xmm7, %xmm11, %xmm3 -; AVX1-NEXT: vxorpd %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vblendvpd %xmm9, %xmm10, %xmm8, %xmm6 -; AVX1-NEXT: vxorpd %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3 -; AVX1-NEXT: vxorpd %xmm4, %xmm3, %xmm5 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6 -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2 -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm2 -; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: 
vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v16i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE2-LABEL: test_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxud %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmaxud %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmaxud %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE2-LABEL: test_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movd %xmm3, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxud %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pmaxud %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pmaxud %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE2-LABEL: test_v16i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm6, 
%xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxud %xmm3, %xmm1 -; SSE41-NEXT: pmaxud %xmm2, %xmm1 -; SSE41-NEXT: pmaxud %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmaxud %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxud %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE2-LABEL: test_v32i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm5, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: 
pcmpgtd %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm9 -; SSE2-NEXT: por %xmm1, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm9 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: por %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxud %xmm6, %xmm2 -; SSE41-NEXT: pmaxud %xmm7, %xmm3 -; SSE41-NEXT: pmaxud %xmm5, %xmm3 -; SSE41-NEXT: pmaxud %xmm1, %xmm3 -; SSE41-NEXT: pmaxud %xmm4, %xmm2 -; SSE41-NEXT: pmaxud %xmm3, %xmm2 -; SSE41-NEXT: pmaxud %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmaxud %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxud %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmaxud %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmaxud 
%xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - -define i16 @test_v2i16(<2 x i16> %a0) { -; SSE2-LABEL: test_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmaxuw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE2-LABEL: test_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxuw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmaxuw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed 
$eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE2-LABEL: test_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: notl %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: notl %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512BW-LABEL: test_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE2-LABEL: test_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, 
%xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxuw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: notl %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: notl %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v16i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE2-LABEL: test_v32i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pmaxsw %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pmaxsw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pmaxsw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i16: -; 
SSE41: # %bb.0: -; SSE41-NEXT: pmaxuw %xmm3, %xmm1 -; SSE41-NEXT: pmaxuw %xmm2, %xmm1 -; SSE41-NEXT: pmaxuw %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: notl %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: notl %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v32i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE2-LABEL: test_v64i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pmaxsw %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pmaxsw %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pmaxsw %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: pmaxsw %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pmaxsw %xmm5, %xmm1 -; SSE2-NEXT: pmaxsw %xmm4, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pmaxsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; 
SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pmaxsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxuw %xmm7, %xmm3 -; SSE41-NEXT: pmaxuw %xmm5, %xmm3 -; SSE41-NEXT: pmaxuw %xmm1, %xmm3 -; SSE41-NEXT: pmaxuw %xmm6, %xmm2 -; SSE41-NEXT: pmaxuw %xmm4, %xmm2 -; SSE41-NEXT: pmaxuw %xmm3, %xmm2 -; SSE41-NEXT: pmaxuw %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: notl %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxuw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: notl %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v64i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vmovd %xmm0, %eax -; AVX512BW-NEXT: notl %eax -; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vmovd %xmm0, %eax -; AVX512VL-NEXT: notl %eax -; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, 
%xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmaxub %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmaxub %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pmaxub %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmaxub %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmaxub %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pmaxub %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm1, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: notb %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: notb %al -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512BW-LABEL: test_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed 
$eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxub %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm1, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: notb %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pmaxub %xmm3, %xmm1 -; SSE2-NEXT: pmaxub %xmm2, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxub %xmm3, %xmm1 -; SSE41-NEXT: pmaxub %xmm2, %xmm1 -; SSE41-NEXT: pmaxub %xmm0, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; 
SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: notb %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v64i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v64i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pmaxub %xmm6, %xmm2 -; SSE2-NEXT: pmaxub %xmm7, %xmm3 -; SSE2-NEXT: pmaxub %xmm5, %xmm3 -; SSE2-NEXT: pmaxub %xmm1, %xmm3 -; SSE2-NEXT: pmaxub %xmm4, %xmm2 -; SSE2-NEXT: pmaxub %xmm3, %xmm2 -; SSE2-NEXT: pmaxub %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: pmaxub %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pmaxub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pmaxub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmaxub %xmm7, %xmm3 -; SSE41-NEXT: pmaxub %xmm5, %xmm3 
-; SSE41-NEXT: pmaxub %xmm1, %xmm3 -; SSE41-NEXT: pmaxub %xmm6, %xmm2 -; SSE41-NEXT: pmaxub %xmm4, %xmm2 -; SSE41-NEXT: pmaxub %xmm3, %xmm2 -; SSE41-NEXT: pmaxub %xmm0, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: notb %al -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmaxub %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: notb %al -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: notb %al -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v128i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax -; AVX512BW-NEXT: notb %al -; AVX512BW-NEXT: # kill: def $al killed $al killed $eax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v128i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax -; AVX512VL-NEXT: notb %al -; AVX512VL-NEXT: # kill: def $al killed $al killed $eax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x 
i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-reduce-umin-widen.ll b/test/CodeGen/X86/vector-reduce-umin-widen.ll deleted file mode 100644 index f5cc88a7d6c..00000000000 --- a/test/CodeGen/X86/vector-reduce-umin-widen.ll +++ /dev/null @@ -1,2007 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE2-LABEL: test_v2i64: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm1, 
%xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512BW-LABEL: test_v2i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE2-LABEL: test_v4i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movq %xmm2, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: pxor %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, 
%xmm2, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: movq %xmm2, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 -; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v4i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v4i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE2-LABEL: test_v8i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; 
SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm5, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm0 -; SSE41-NEXT: xorpd %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = 
[9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm6 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v8i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v8i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE2-LABEL: test_v16i64: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; 
SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm10 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm8 -; SSE2-NEXT: pshufd 
{{.*#+}} xmm1 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movq %xmm3, %rax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i64: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm10 -; SSE41-NEXT: pxor %xmm9, %xmm10 -; SSE41-NEXT: movdqa %xmm10, %xmm11 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm11 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm11, %xmm0 -; SSE41-NEXT: por %xmm10, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm7, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm2 -; SSE41-NEXT: pxor %xmm9, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; SSE41-NEXT: movapd %xmm4, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm6, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm0 -; SSE41-NEXT: xorpd %xmm9, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1] -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm9, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd 
%xmm0, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: movq %xmm1, %rax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 -; AVX1-NEXT: vpxor %xmm4, %xmm12, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13 -; AVX1-NEXT: vpxor %xmm4, %xmm13, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm10, %xmm5, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm11 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm11, %xmm6, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm7, %xmm5 -; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm11 -; AVX1-NEXT: vblendvpd %xmm10, %xmm12, %xmm13, %xmm7 -; AVX1-NEXT: vxorpd %xmm4, %xmm7, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm11, %xmm6, %xmm6 -; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm7, %xmm2 -; AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm6 -; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm3 -; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm2 -; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2 -; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3 -; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512BW-LABEL: test_v16i64: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovq %xmm0, %rax -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512VL-LABEL: test_v16i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0 -; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE2-LABEL: test_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminud %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE2-LABEL: test_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pminud %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pminud %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} 
xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE2-LABEL: test_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movd %xmm2, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pminud %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pminud %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pminud %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE2-LABEL: test_v16i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; 
SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm6, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: movd %xmm4, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pminud %xmm3, %xmm1 -; SSE41-NEXT: pminud %xmm2, %xmm1 -; SSE41-NEXT: pminud %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pminud %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminud %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE2-LABEL: test_v32i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm8, %xmm10 -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm9 -; SSE2-NEXT: por %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE2-NEXT: pand 
%xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm4, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm7, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm5, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm4, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm9, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm8 -; SSE2-NEXT: pand %xmm8, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm8 -; SSE2-NEXT: por %xmm3, %xmm8 -; SSE2-NEXT: movd %xmm8, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pminud %xmm6, %xmm2 -; SSE41-NEXT: pminud %xmm7, %xmm3 -; SSE41-NEXT: pminud %xmm5, %xmm3 -; SSE41-NEXT: pminud %xmm1, %xmm3 -; SSE41-NEXT: pminud %xmm4, %xmm2 -; SSE41-NEXT: pminud %xmm3, %xmm2 -; SSE41-NEXT: pminud %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminud %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, 
%xmm1 -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - -define i16 @test_v2i16(<2 x i16> %a0) { -; SSE2-LABEL: test_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pminuw %xmm0, %xmm1 -; SSE41-NEXT: movd %xmm1, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE2-LABEL: test_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminuw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pminuw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed 
$ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE2-LABEL: test_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE2-LABEL: test_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pminuw %xmm1, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; 
AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE2-LABEL: test_v32i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pminsw %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pminsw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pminsw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pminuw %xmm3, %xmm1 -; SSE41-NEXT: pminuw %xmm2, %xmm1 -; SSE41-NEXT: pminuw %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x i16> %a0) { -; SSE2-LABEL: test_v64i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: pminsw 
%xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: pminsw %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm7 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: pminsw %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: pminsw %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pminsw %xmm5, %xmm1 -; SSE2-NEXT: pminsw %xmm4, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: pminsw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: pminsw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000 -; SSE2-NEXT: # kill: def $ax killed $ax killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pminuw %xmm7, %xmm3 -; SSE41-NEXT: pminuw %xmm5, %xmm3 -; SSE41-NEXT: pminuw %xmm1, %xmm3 -; SSE41-NEXT: pminuw %xmm6, %xmm2 -; SSE41-NEXT: pminuw %xmm4, %xmm2 -; SSE41-NEXT: pminuw %xmm3, %xmm2 -; SSE41-NEXT: pminuw %xmm0, %xmm2 -; SSE41-NEXT: phminposuw %xmm2, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: # kill: def $ax killed $ax killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminuw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminuw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; 
SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pminub %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al 
killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vphminposuw %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq -; -; AVX512-LABEL: test_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pminub %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: phminposuw %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; 
AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pminub %xmm3, %xmm1 -; SSE2-NEXT: pminub %xmm2, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pminub %xmm3, %xmm1 -; SSE41-NEXT: pminub %xmm2, %xmm1 -; SSE41-NEXT: pminub %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm1, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pminub %xmm6, %xmm2 -; SSE2-NEXT: pminub %xmm7, %xmm3 -; SSE2-NEXT: pminub %xmm5, %xmm3 -; SSE2-NEXT: pminub %xmm1, %xmm3 -; SSE2-NEXT: 
pminub %xmm4, %xmm2 -; SSE2-NEXT: pminub %xmm3, %xmm2 -; SSE2-NEXT: pminub %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: pminub %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pminub %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pminub %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pminub %xmm7, %xmm3 -; SSE41-NEXT: pminub %xmm5, %xmm3 -; SSE41-NEXT: pminub %xmm1, %xmm3 -; SSE41-NEXT: pminub %xmm6, %xmm2 -; SSE41-NEXT: pminub %xmm4, %xmm2 -; SSE41-NEXT: pminub %xmm3, %xmm2 -; SSE41-NEXT: pminub %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pminub %xmm2, %xmm0 -; SSE41-NEXT: phminposuw %xmm0, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpminub %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vphminposuw %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpminub %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vphminposuw %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vphminposuw %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>) -declare i32 
@llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-reduce-xor-widen.ll b/test/CodeGen/X86/vector-reduce-xor-widen.ll deleted file mode 100644 index 7bd3db37c83..00000000000 --- a/test/CodeGen/X86/vector-reduce-xor-widen.ll +++ /dev/null @@ -1,1168 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 - -; -; vXi64 -; - -define i64 @test_v2i64(<2 x i64> %a0) { -; SSE-LABEL: test_v2i64: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i64: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v4i64(<4 x i64> %a0) { -; SSE-LABEL: test_v4i64: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v4i64: -; 
AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v8i64(<8 x i64> %a0) { -; SSE-LABEL: test_v8i64: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> %a0) - ret i64 %1 -} - -define i64 @test_v16i64(<16 x i64> %a0) { -; SSE-LABEL: test_v16i64: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; 
AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> %a0) - ret i64 %1 -} - -; -; vXi32 -; - -define i32 @test_v2i32(<2 x i32> %a0) { -; SSE-LABEL: test_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v4i32(<4 x i32> %a0) { -; SSE-LABEL: test_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v8i32(<8 x i32> %a0) { -; SSE-LABEL: test_v8i32: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v16i32(<16 x i32> %a0) { -; SSE-LABEL: test_v16i32: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: 
pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> %a0) - ret i32 %1 -} - -define i32 @test_v32i32(<32 x i32> %a0) { -; SSE-LABEL: test_v32i32: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v32i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 
-; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> %a0) - ret i32 %1 -} - -; -; vXi16 -; - -define i16 @test_v2i16(<2 x i16> %a0) { -; SSE-LABEL: test_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v4i16(<4 x i16> %a0) { -; SSE-LABEL: test_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v8i16(<8 x i16> %a0) { -; SSE-LABEL: test_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX-LABEL: test_v8i16: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: # kill: def $ax killed $ax killed $eax -; AVX-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v16i16(<16 x i16> %a0) { -; SSE-LABEL: test_v16i16: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movd %xmm1, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v32i16(<32 x i16> %a0) { -; SSE-LABEL: test_v32i16: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm1 -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v32i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> %a0) - ret i16 %1 -} - -define i16 @test_v64i16(<64 x 
i16> %a0) { -; SSE-LABEL: test_v64i16: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm3 -; SSE-NEXT: pxor %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm2 -; SSE-NEXT: pxor %xmm3, %xmm2 -; SSE-NEXT: pxor %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE-NEXT: pxor %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: movd %xmm0, %eax -; SSE-NEXT: # kill: def $ax killed $ax killed $eax -; SSE-NEXT: retq -; -; AVX1-LABEL: test_v64i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: # kill: def $ax killed $ax killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: # kill: def $ax killed $ax killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: # kill: def $ax killed $ax killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> %a0) - ret i16 %1 -} - -; -; vXi8 -; - -define i8 @test_v2i8(<2 x i8> %a0) { -; SSE2-LABEL: test_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 
x i8> %a0) - ret i8 %1 -} - -define i8 @test_v4i8(<4 x i8> %a0) { -; SSE2-LABEL: test_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v8i8(<8 x i8> %a0) { -; SSE2-LABEL: test_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v16i8(<16 x i8> %a0) { -; SSE2-LABEL: test_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; 
SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX-LABEL: test_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpextrb $0, %xmm0, %eax -; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v32i8(<32 x i8> %a0) { -; SSE2-LABEL: test_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrlw $8, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pextrb $0, %xmm0, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v32i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; 
AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v64i8(<64 x i8> %a0) { -; SSE2-LABEL: test_v64i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v64i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v64i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v64i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v64i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 
@llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> %a0) - ret i8 %1 -} - -define i8 @test_v128i8(<128 x i8> %a0) { -; SSE2-LABEL: test_v128i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: # kill: def $al killed $al killed $eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: test_v128i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm6, %xmm2 -; SSE41-NEXT: pxor %xmm7, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm3 -; SSE41-NEXT: pxor %xmm4, %xmm2 -; SSE41-NEXT: pxor %xmm3, %xmm2 -; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pextrb $0, %xmm1, %eax -; SSE41-NEXT: # kill: def $al killed $al killed $eax -; SSE41-NEXT: retq -; -; AVX1-LABEL: test_v128i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1 -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpextrb $0, %xmm0, %eax -; AVX1-NEXT: # kill: def $al killed $al killed $eax -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: test_v128i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpextrb $0, %xmm0, %eax -; AVX2-NEXT: # kill: def $al killed $al killed $eax -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: test_v128i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; 
AVX512-NEXT: vpsrld $16, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1 -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpextrb $0, %xmm0, %eax -; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> %a0) - ret i8 %1 -} - -declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>) -declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>) - -declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>) -declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>) - -declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>) -declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>) - -declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>) -declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>) diff --git a/test/CodeGen/X86/vector-sext-widen.ll b/test/CodeGen/X86/vector-sext-widen.ll deleted file mode 100644 index 08e8b514c60..00000000000 --- a/test/CodeGen/X86/vector-sext-widen.ll +++ /dev/null @@ -1,3966 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW -; -; Just two 32-bit runs to make sure we do reasonable things there. 
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE41 - -define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i8_to_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i8_to_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psraw $8, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i8_to_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_16i8_to_8i16: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i8_to_8i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psraw $8, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_16i8_to_8i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> - %C = sext <8 x i8> %B to <8 x i16> - ret <8 x i16> %C -} - -define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i8_to_16i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i8_to_16i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSSE3-NEXT: psraw $8, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; SSSE3-NEXT: psraw $8, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i8_to_16i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_16i8_to_16i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_16i8_to_16i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_16i8_to_16i16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i8_to_16i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE2-NEXT: psraw $8, %xmm2 -; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] -; X32-SSE2-NEXT: psraw $8, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_16i8_to_16i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = sext <16 x i8> %A to <16 x i16> - ret <16 x i16> %B -} - -define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_32i8_to_32i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: psraw $8, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_32i8_to_32i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSSE3-NEXT: psraw $8, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSSE3-NEXT: psraw $8, %xmm5 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSSE3-NEXT: psraw $8, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSSE3-NEXT: psraw $8, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_32i8_to_32i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw %xmm0, %xmm5 -; SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxbw %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_32i8_to_32i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1 -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_32i8_to_32i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sext_32i8_to_32i16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2 -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1 -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sext_32i8_to_32i16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 -; AVX512BW-NEXT: retq -; -; X32-SSE2-LABEL: sext_32i8_to_32i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; X32-SSE2-NEXT: psraw $8, %xmm4 -; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; X32-SSE2-NEXT: psraw $8, %xmm5 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; X32-SSE2-NEXT: psraw $8, %xmm2 -; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; X32-SSE2-NEXT: psraw $8, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm5, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_32i8_to_32i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5 -; X32-SSE41-NEXT: pmovsxbw %xmm1, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm4 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm3 -; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 -; X32-SSE41-NEXT: retl -entry: - %B = sext <32 x i8> %A to <32 x i16> - ret <32 x i16> %B -} - -define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i8_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i8_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i8_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_16i8_to_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i8_to_4i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_16i8_to_4i32: -; X32-SSE41: # %bb.0: # %entry -; 
X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> - %C = sext <4 x i8> %B to <4 x i32> - ret <4 x i32> %C -} - -define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i8_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: psrad $24, %xmm2 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i8_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxbd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_16i8_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_16i8_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_16i8_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i8_to_8i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: psrad $24, %xmm2 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_16i8_to_8i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> - %C = sext <8 x i8> %B to <8 x i32> - ret <8 x i32> %C -} - -define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i8_to_16i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSE2-NEXT: psrad $24, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: 
punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i8_to_16i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; SSSE3-NEXT: psrad $24, %xmm4 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: psrad $24, %xmm2 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: psrad $24, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i8_to_16i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxbd %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxbd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_16i8_to_16i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_16i8_to_16i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_16i8_to_16i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i8_to_16i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; X32-SSE2-NEXT: psrad $24, %xmm4 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: psrad $24, %xmm2 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: psrad $24, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: 
sext_16i8_to_16i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X32-SSE41-NEXT: pmovsxbd %xmm1, %xmm1 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxbd %xmm2, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm3 -; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = sext <16 x i8> %A to <16 x i32> - ret <16 x i32> %B -} - -define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i8_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i8_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i8_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbq %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_16i8_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxbq %xmm0, %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i8_to_2i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_16i8_to_2i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> - %C = sext <2 x i8> %B to <2 x i64> - ret <2 x i64> %C -} - -define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i8_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i8_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i8_to_4i64: -; SSE41: # 
%bb.0: # %entry -; SSE41-NEXT: pmovsxbq %xmm0, %xmm2 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_16i8_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_16i8_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_16i8_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i8_to_4i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_16i8_to_4i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 -; X32-SSE41-NEXT: psrld $16, %xmm0 -; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> - %C = sext <4 x i8> %B to <4 x i64> - ret <4 x i64> %C -} - -define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i8_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i8_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: psrad $24, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSSE3-NEXT: punpckhdq 
{{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i8_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbq %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmovsxbq %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxbq %xmm2, %xmm2 -; SSE41-NEXT: psrlq $48, %xmm0 -; SSE41-NEXT: pmovsxbq %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_16i8_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_16i8_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_16i8_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i8_to_8i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: pxor %xmm5, %xmm5 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: psrad $24, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_16i8_to_8i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4 -; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE41-NEXT: psrld $16, %xmm1 -; X32-SSE41-NEXT: pmovsxbq %xmm1, %xmm1 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; X32-SSE41-NEXT: pmovsxbq %xmm2, %xmm2 -; X32-SSE41-NEXT: psrlq $48, %xmm0 -; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm3 -; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> - %C = sext <8 x i8> %B to <8 x i64> - ret <8 x i64> %C -} - -define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_8i16_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_8i16_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_8i16_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwd %xmm0, 
%xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_8i16_to_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_8i16_to_4i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: psrad $16, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_8i16_to_4i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> - %C = sext <4 x i16> %B to <4 x i32> - ret <4 x i32> %C -} - -define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_8i16_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_8i16_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: psrad $16, %xmm2 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_8i16_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_8i16_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_8i16_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_8i16_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_8i16_to_8i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; X32-SSE2-NEXT: psrad $16, %xmm2 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_8i16_to_8i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = sext <8 x i16> %A to <8 x i32> - ret <8 x i32> %B -} - -define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_16i16_to_16i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm2 -; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_16i16_to_16i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSSE3-NEXT: psrad $16, %xmm4 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSSE3-NEXT: psrad $16, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: psrad $16, %xmm2 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_16i16_to_16i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwd %xmm0, %xmm5 -; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxwd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_16i16_to_16i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_16i16_to_16i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_16i16_to_16i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_16i16_to_16i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; X32-SSE2-NEXT: psrad $16, %xmm4 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; X32-SSE2-NEXT: psrad $16, %xmm5 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; X32-SSE2-NEXT: psrad $16, %xmm2 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; X32-SSE2-NEXT: psrad $16, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm5, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_16i16_to_16i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5 -; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm4 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm3 -; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 -; X32-SSE41-NEXT: retl -entry: - %B = sext <16 x i16> %A to <16 x i32> - ret <16 x i32> %B -} - -define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind 
uwtable readnone ssp { -; SSE2-LABEL: sext_8i16_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_8i16_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_8i16_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwq %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_8i16_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxwq %xmm0, %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_8i16_to_2i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $16, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_8i16_to_2i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> - %C = sext <2 x i16> %B to <2 x i64> - ret <2 x i64> %C -} - -define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_8i16_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_8i16_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_8i16_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_8i16_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_8i16_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_8i16_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_8i16_to_4i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: 
pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_8i16_to_4i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> - %C = sext <4 x i16> %B to <4 x i64> - ret <4 x i64> %C -} - -define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_8i16_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_8i16_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_8i16_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwq %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm1, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwq %xmm2, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_8i16_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_8i16_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2 -; AVX2-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovsxwq %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_8i16_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_8i16_to_8i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: pxor %xmm5, %xmm5 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm4 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; X32-SSE2-NEXT: psrad $16, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; X32-SSE2-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_8i16_to_8i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; X32-SSE41-NEXT: pmovsxwq %xmm1, %xmm1 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxwq %xmm2, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm3 -; X32-SSE41-NEXT: movdqa %xmm4, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = sext <8 x i16> %A to <8 x i64> - ret <8 x i64> %B -} - -define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_4i32_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_4i32_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_4i32_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxdq %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_4i32_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_4i32_to_2i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_4i32_to_2i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> - %C = sext <2 x i32> %B to <2 x i64> - ret <2 x i64> %C -} - -define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_4i32_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: 
sext_4i32_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_4i32_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_4i32_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_4i32_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_4i32_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_4i32_to_4i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pxor %xmm3, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_4i32_to_4i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %B = sext <4 x i32> %A to <4 x i64> - ret <4 x i64> %B -} - -define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_8i32_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_8i32_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm5, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = 
xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_8i32_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxdq %xmm0, %xmm5 -; SSE41-NEXT: pmovsxdq %xmm1, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_8i32_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_8i32_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1 -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_8i32_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_8i32_to_8i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: pxor %xmm4, %xmm4 -; X32-SSE2-NEXT: pxor %xmm3, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X32-SSE2-NEXT: pxor %xmm5, %xmm5 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: pxor %xmm3, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_8i32_to_8i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5 -; X32-SSE41-NEXT: pmovsxdq %xmm1, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm4 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm3 -; X32-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X32-SSE41-NEXT: movdqa %xmm4, %xmm1 -; X32-SSE41-NEXT: retl -entry: - %B = sext <8 x i32> %A to <8 x i64> - ret <8 x i64> %B -} - -define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) { -; SSE-LABEL: load_sext_2i1_to_2i64: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movzbl (%rdi), %eax -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shlq $62, %rcx -; SSE-NEXT: movq %rcx, %xmm0 -; SSE-NEXT: shlq $63, %rax -; SSE-NEXT: movq %rax, %xmm1 -; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE-NEXT: retq -; -; AVX1-LABEL: load_sext_2i1_to_2i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movzbl (%rdi), %eax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: shlq $63, %rax -; AVX1-NEXT: vmovq %rax, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, 
%xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_2i1_to_2i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movzbl (%rdi), %eax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: shlq $63, %rax -; AVX2-NEXT: vmovq %rax, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_2i1_to_2i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_2i1_to_2i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzbl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $30, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; X32-SSE2-NEXT: shll $31, %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE2-NEXT: psrad $31, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_2i1_to_2i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzbl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $31, %ecx -; X32-SSE41-NEXT: movd %ecx, %xmm0 -; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; X32-SSE41-NEXT: shll $30, %eax -; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0 -; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 -; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <2 x i1>, <2 x i1>* %ptr - %Y = sext <2 x i1> %X to <2 x i64> - ret <2 x i64> %Y -} - -define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) { -; SSE2-LABEL: load_sext_2i8_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_2i8_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzwl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_2i8_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: load_sext_2i8_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxbq (%rdi), %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_2i8_to_2i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzwl (%eax), %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad 
$24, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_2i8_to_2i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <2 x i8>, <2 x i8>* %ptr - %Y = sext <2 x i8> %X to <2 x i64> - ret <2 x i64> %Y -} - -define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) { -; SSE2-LABEL: load_sext_4i1_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $60, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $61, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $62, %rcx -; SSE2-NEXT: sarq $63, %rcx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: shlq $63, %rax -; SSE2-NEXT: sarq $63, %rax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_4i1_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movl (%rdi), %eax -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $60, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $61, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $62, %rcx -; SSSE3-NEXT: sarq $63, %rcx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: shlq $63, %rax -; SSSE3-NEXT: sarq $63, %rax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_4i1_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movl (%rdi), %eax -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $62, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $63, %rdx -; SSE41-NEXT: sarq $63, %rdx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $61, %rcx -; SSE41-NEXT: sarq $63, %rcx -; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 -; SSE41-NEXT: shlq $60, %rax -; SSE41-NEXT: sarq $63, %rax -; SSE41-NEXT: pinsrd $3, %eax, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_4i1_to_4i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movl (%rdi), %eax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shlq $60, %rax -; AVX1-NEXT: sarq $63, %rax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_4i1_to_4i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movl (%rdi), %eax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: movq %rax, %rdx -; AVX2-NEXT: shlq $63, %rdx -; AVX2-NEXT: sarq $63, %rdx -; AVX2-NEXT: vmovd %edx, %xmm0 -; AVX2-NEXT: 
vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: sarq $63, %rcx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: shlq $60, %rax -; AVX2-NEXT: sarq $63, %rax -; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_4i1_to_4i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_4i1_to_4i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $28, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $29, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shll $30, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm2 -; X32-SSE2-NEXT: shll $31, %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE2-NEXT: psrad $31, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_4i1_to_4i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $30, %ecx -; X32-SSE41-NEXT: movl %eax, %edx -; X32-SSE41-NEXT: shll $31, %edx -; X32-SSE41-NEXT: movd %edx, %xmm0 -; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shll $29, %ecx -; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm0 -; X32-SSE41-NEXT: shll $28, %eax -; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 -; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <4 x i1>, <4 x i1>* %ptr - %Y = sext <4 x i1> %X to <4 x i32> - ret <4 x i32> %Y -} - -define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) { -; SSE2-LABEL: load_sext_4i8_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_4i8_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_4i8_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: load_sext_4i8_to_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxbd (%rdi), %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_4i8_to_4i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_4i8_to_4i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax 
-; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <4 x i8>, <4 x i8>* %ptr - %Y = sext <4 x i8> %X to <4 x i32> - ret <4 x i32> %Y -} - -define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) { -; SSE2-LABEL: load_sext_4i1_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $3, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: movl %eax, %ecx -; SSE2-NEXT: shrl $2, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: shrl %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] -; SSE2-NEXT: psllq $63, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] -; SSE2-NEXT: psllq $63, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_4i1_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movl (%rdi), %eax -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $3, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: movl %eax, %ecx -; SSSE3-NEXT: shrl $2, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: shrl %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] -; SSSE3-NEXT: psllq $63, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] -; SSSE3-NEXT: psllq $63, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_4i1_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movl (%rdi), %eax -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl %ecx -; SSE41-NEXT: movd %eax, %xmm1 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm1 -; SSE41-NEXT: movl %eax, %ecx -; SSE41-NEXT: shrl $2, %ecx -; SSE41-NEXT: pinsrd $2, %ecx, %xmm1 -; SSE41-NEXT: shrl $3, %eax -; SSE41-NEXT: pinsrd $3, %eax, %xmm1 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm1 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: psllq $63, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE41-NEXT: psllq $63, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_4i1_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: movl (%rdi), %eax -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $62, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: movq %rax, %rdx -; AVX1-NEXT: shlq $63, %rdx -; AVX1-NEXT: sarq $63, %rdx -; AVX1-NEXT: vmovd %edx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: movq %rax, %rcx -; AVX1-NEXT: shlq $61, %rcx -; AVX1-NEXT: sarq $63, %rcx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: shlq $60, %rax -; AVX1-NEXT: sarq $63, %rax -; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; 
AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_4i1_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movl (%rdi), %eax -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $60, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm0 -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $61, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX2-NEXT: movq %rax, %rcx -; AVX2-NEXT: shlq $62, %rcx -; AVX2-NEXT: vmovq %rcx, %xmm1 -; AVX2-NEXT: shlq $63, %rax -; AVX2-NEXT: vmovq %rax, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_4i1_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_4i1_to_4i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzbl (%eax), %eax -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $3, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: movl %eax, %ecx -; X32-SSE2-NEXT: shrl $2, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE2-NEXT: movd %eax, %xmm2 -; X32-SSE2-NEXT: shrl %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] -; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3] -; X32-SSE2-NEXT: psllq $63, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3] -; X32-SSE2-NEXT: psllq $63, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_4i1_to_4i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzbl (%eax), %eax -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl %ecx -; X32-SSE41-NEXT: movd %eax, %xmm1 -; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm1 -; X32-SSE41-NEXT: movl %eax, %ecx -; X32-SSE41-NEXT: shrl $2, %ecx -; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm1 -; X32-SSE41-NEXT: shrl $3, %eax -; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1 -; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm1 -; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero -; X32-SSE41-NEXT: psllq $63, %xmm0 -; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; X32-SSE41-NEXT: psllq $63, %xmm1 -; X32-SSE41-NEXT: psrad $31, %xmm1 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE41-NEXT: retl -entry: - %X = load <4 x i1>, <4 x i1>* %ptr - %Y = sext <4 x i1> %X to <4 x i64> - ret <4 x i64> %Y -} - -define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) { -; SSE2-LABEL: load_sext_4i8_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_4i8_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_4i8_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_4i8_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbq (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_4i8_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_4i8_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_4i8_to_4i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_4i8_to_4i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 -; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1 -; X32-SSE41-NEXT: retl -entry: - %X = load <4 x i8>, <4 x i8>* %ptr - %Y = sext <4 x i8> %X to <4 x i64> - ret <4 x i64> %Y -} - -define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) { -; SSE2-LABEL: load_sext_4i8_to_4i64_extract: -; SSE2: # %bb.0: -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_4i8_to_4i64_extract: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: 
pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_4i8_to_4i64_extract: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_4i8_to_4i64_extract: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_4i8_to_4i64_extract: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_4i8_to_4i64_extract: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_4i8_to_4i64_extract: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract: -; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm0 -; X32-SSE41-NEXT: retl - %ld = load <4 x i8>, <4 x i8>* %ptr - %sext = sext <4 x i8> %ld to <4 x i64> - %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> - ret <2 x i64> %extract -} - -define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) { -; SSE-LABEL: load_sext_8i1_to_8i16: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pcmpeqw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: load_sext_8i1_to_8i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_8i1_to_8i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_sext_8i1_to_8i16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: kmovw (%rdi), %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: load_sext_8i1_to_8i16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: kmovw (%rdi), %k0 -; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; X32-SSE-LABEL: load_sext_8i1_to_8i16: -; X32-SSE: # %bb.0: # %entry -; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; 
X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqw %xmm1, %xmm0 -; X32-SSE-NEXT: retl -entry: - %X = load <8 x i1>, <8 x i1>* %ptr - %Y = sext <8 x i1> %X to <8 x i16> - ret <8 x i16> %Y -} - -define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) { -; SSE2-LABEL: load_sext_8i8_to_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i8_to_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psraw $8, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i8_to_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: load_sext_8i8_to_8i16: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxbw (%rdi), %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_8i8_to_8i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psraw $8, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_8i8_to_8i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <8 x i8>, <8 x i8>* %ptr - %Y = sext <8 x i8> %X to <8 x i16> - ret <8 x i16> %Y -} - -define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) { -; SSE2-LABEL: load_sext_8i8_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: psrad $24, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i8_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: psrad 
$24, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i8_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 -; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1 -; SSE41-NEXT: pmovsxbq 4(%rdi), %xmm2 -; SSE41-NEXT: pmovsxbq 6(%rdi), %xmm3 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_8i8_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbq 6(%rdi), %xmm1 -; AVX1-NEXT: vpmovsxbq 4(%rdi), %xmm2 -; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbq (%rdi), %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_8i8_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0 -; AVX2-NEXT: vpmovsxbq 4(%rdi), %ymm1 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_8i8_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_8i8_to_8i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: pxor %xmm4, %xmm4 -; X32-SSE2-NEXT: pxor %xmm3, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; X32-SSE2-NEXT: psrad $24, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_8i8_to_8i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0 -; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1 -; X32-SSE41-NEXT: pmovsxbq 4(%eax), %xmm2 -; X32-SSE41-NEXT: pmovsxbq 6(%eax), %xmm3 -; X32-SSE41-NEXT: retl -entry: - %X = load <8 x i8>, <8 x i8>* %ptr - %Y = sext <8 x i8> %X to <8 x i64> - ret <8 x i64> %Y -} - -define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) { -; SSE-LABEL: load_sext_8i1_to_8i32: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pcmpeqd %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: load_sext_8i1_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqd %xmm2, 
%xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_8i1_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_8i1_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: kmovw (%rdi), %k1 -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512-NEXT: retq -; -; X32-SSE-LABEL: load_sext_8i1_to_8i32: -; X32-SSE: # %bb.0: # %entry -; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8] -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128] -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm1 -; X32-SSE-NEXT: retl -entry: - %X = load <8 x i1>, <8 x i1>* %ptr - %Y = sext <8 x i1> %X to <8 x i32> - ret <8 x i32> %Y -} - -define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) { -; SSE2-LABEL: load_sext_8i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i8_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i8_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 -; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_8i8_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_8i8_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_8i8_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_8i8_to_8i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: 
load_sext_8i8_to_8i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0 -; X32-SSE41-NEXT: pmovsxbd 4(%eax), %xmm1 -; X32-SSE41-NEXT: retl -entry: - %X = load <8 x i8>, <8 x i8>* %ptr - %Y = sext <8 x i8> %X to <8 x i32> - ret <8 x i32> %Y -} - -define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone { -; SSE2-LABEL: load_sext_16i1_to_16i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_16i1_to_16i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_16i1_to_16i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_16i1_to_16i8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307] -; AVX1-NEXT: # xmm1 = mem[0,0] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_16i1_to_16i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745] -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_sext_16i1_to_16i8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: kmovw (%rdi), %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: load_sext_16i1_to_16i8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: kmovw (%rdi), %k0 -; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_16i1_to_16i8: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; X32-SSE2-NEXT: pand %xmm1, %xmm0 -; X32-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 -; X32-SSE2-NEXT: retl -; -; 
X32-SSE41-LABEL: load_sext_16i1_to_16i8: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1] -; X32-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; X32-SSE41-NEXT: pand %xmm1, %xmm0 -; X32-SSE41-NEXT: pcmpeqb %xmm1, %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <16 x i1>, <16 x i1>* %ptr - %Y = sext <16 x i1> %X to <16 x i8> - ret <16 x i8> %Y -} - -define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) { -; SSE-LABEL: load_sext_16i1_to_16i16: -; SSE: # %bb.0: # %entry -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pcmpeqw %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pcmpeqw %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: load_sext_16i1_to_16i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_16i1_to_16i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_sext_16i1_to_16i16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: kmovw (%rdi), %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: load_sext_16i1_to_16i16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: kmovw (%rdi), %k0 -; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512BW-NEXT: retq -; -; X32-SSE-LABEL: load_sext_16i1_to_16i16: -; X32-SSE: # %bb.0: # %entry -; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768] -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm1 -; X32-SSE-NEXT: retl -entry: - %X = load <16 x i1>, <16 x i1>* %ptr - %Y = sext <16 x i1> %X to <16 x i16> - ret <16 x i16> %Y -} - -define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone { -; SSE-LABEL: load_sext_32i1_to_32i8: -; SSE: # %bb.0: # 
%entry -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pcmpeqb %xmm2, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: load_sext_32i1_to_32i8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5] -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_32i1_to_32i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745] -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: load_sext_32i1_to_32i8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: kmovw (%rdi), %k1 -; AVX512F-NEXT: kmovw 2(%rdi), %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: load_sext_32i1_to_32i8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: kmovd (%rdi), %k0 -; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512BW-NEXT: retq -; -; X32-SSE-LABEL: load_sext_32i1_to_32i8: -; X32-SSE: # %bb.0: # %entry -; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1] -; X32-SSE-NEXT: pand %xmm2, %xmm1 -; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1 -; X32-SSE-NEXT: retl -entry: - %X = load <32 x i1>, <32 x i1>* %ptr - %Y = sext <32 x i1> %X to <32 x i8> - ret <32 x i8> %Y -} - -define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) { -; 
SSE2-LABEL: load_sext_16i8_to_16i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_16i8_to_16i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: psraw $8, %xmm0 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: psraw $8, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_16i8_to_16i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw (%rdi), %xmm0 -; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_16i8_to_16i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxbw (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_16i8_to_16i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_16i8_to_16i16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_16i8_to_16i16: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movdqa (%eax), %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X32-SSE2-NEXT: psraw $8, %xmm0 -; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; X32-SSE2-NEXT: psraw $8, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_16i8_to_16i16: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0 -; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1 -; X32-SSE41-NEXT: retl -entry: - %X = load <16 x i8>, <16 x i8>* %ptr - %Y = sext <16 x i8> %X to <16 x i16> - ret <16 x i16> %Y -} - -define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) { -; SSE2-LABEL: load_sext_2i16_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_2i16_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_2i16_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: load_sext_2i16_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxwq (%rdi), %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_2i16_to_2i64: -; X32-SSE2: # %bb.0: # %entry -; 
X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $16, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_2i16_to_2i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <2 x i16>, <2 x i16>* %ptr - %Y = sext <2 x i16> %X to <2 x i64> - ret <2 x i64> %Y -} - -define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) { -; SSE2-LABEL: load_sext_4i16_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_4i16_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_4i16_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: load_sext_4i16_to_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxwd (%rdi), %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_4i16_to_4i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: psrad $16, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_4i16_to_4i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <4 x i16>, <4 x i16>* %ptr - %Y = sext <4 x i16> %X to <4 x i32> - ret <4 x i32> %Y -} - -define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) { -; SSE2-LABEL: load_sext_4i16_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_4i16_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_4i16_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 -; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_4i16_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwq (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: 
load_sext_4i16_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_4i16_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_4i16_to_4i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_4i16_to_4i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0 -; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1 -; X32-SSE41-NEXT: retl -entry: - %X = load <4 x i16>, <4 x i16>* %ptr - %Y = sext <4 x i16> %X to <4 x i64> - ret <4 x i64> %Y -} - -define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) { -; SSE2-LABEL: load_sext_8i16_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_8i16_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_8i16_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 -; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_8i16_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_8i16_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_8i16_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_8i16_to_8i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movdqa (%eax), %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: psrad $16, %xmm0 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_8i16_to_8i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0 -; X32-SSE41-NEXT: pmovsxwd 8(%eax), %xmm1 -; X32-SSE41-NEXT: retl -entry: - %X = load <8 x i16>, <8 x i16>* %ptr - %Y = sext <8 x i16> %X to <8 x i32> - ret <8 x i32> %Y -} - -define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) { -; SSE2-LABEL: load_sext_2i32_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; 
SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_2i32_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_2i32_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxdq (%rdi), %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: load_sext_2i32_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxdq (%rdi), %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_2i32_to_2i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_2i32_to_2i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0 -; X32-SSE41-NEXT: retl -entry: - %X = load <2 x i32>, <2 x i32>* %ptr - %Y = sext <2 x i32> %X to <2 x i64> - ret <2 x i64> %Y -} - -define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) { -; SSE2-LABEL: load_sext_4i32_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_sext_4i32_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa (%rdi), %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_sext_4i32_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxdq (%rdi), %xmm0 -; SSE41-NEXT: pmovsxdq 8(%rdi), %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_sext_4i32_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm0 -; AVX1-NEXT: vpmovsxdq (%rdi), %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_sext_4i32_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_sext_4i32_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: load_sext_4i32_to_4i64: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movdqa (%eax), %xmm0 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pxor %xmm3, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: load_sext_4i32_to_4i64: -; X32-SSE41: 
# %bb.0: # %entry -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0 -; X32-SSE41-NEXT: pmovsxdq 8(%eax), %xmm1 -; X32-SSE41-NEXT: retl -entry: - %X = load <4 x i32>, <4 x i32>* %ptr - %Y = sext <4 x i32> %X to <4 x i64> - ret <4 x i64> %Y -} - -define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_2i8_to_i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_2i8_to_i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psraw $8, %xmm0 -; SSSE3-NEXT: movd %xmm0, %eax -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_2i8_to_i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; SSE41-NEXT: movd %xmm0, %eax -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_2i8_to_i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_2i8_to_i32: -; X32-SSE2: # %bb.0: # %entry -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psraw $8, %xmm0 -; X32-SSE2-NEXT: movd %xmm0, %eax -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_2i8_to_i32: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0 -; X32-SSE41-NEXT: movd %xmm0, %eax -; X32-SSE41-NEXT: retl -entry: - %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> - %Ex = sext <2 x i8> %Shuf to <2 x i16> - %Bc = bitcast <2 x i16> %Ex to i32 - ret i32 %Bc -} - -define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) { -; SSE2-LABEL: sext_4i1_to_4i64: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_4i1_to_4i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pxor %xmm3, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_4i1_to_4i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxdq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_4i1_to_4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_4i1_to_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_4i1_to_4i64: -; AVX512: # %bb.0: -; 
AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_4i1_to_4i64: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pslld $31, %xmm0 -; X32-SSE2-NEXT: psrad $31, %xmm0 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pxor %xmm3, %xmm3 -; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_4i1_to_4i64: -; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: pslld $31, %xmm0 -; X32-SSE41-NEXT: psrad $31, %xmm0 -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE41-NEXT: retl - %extmask = sext <4 x i1> %mask to <4 x i64> - ret <4 x i64> %extmask -} - -define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) { -; SSE2-LABEL: sext_4i8_to_4i64: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: psrad $24, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_4i8_to_4i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_4i8_to_4i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovsxbq %xmm0, %xmm2 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovsxbq %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_4i8_to_4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_4i8_to_4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_4i8_to_4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_4i8_to_4i64: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE2-NEXT: psrad $24, %xmm1 -; X32-SSE2-NEXT: pxor %xmm2, %xmm2 -; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_4i8_to_4i64: -; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2 -; 
X32-SSE41-NEXT: psrld $16, %xmm0 -; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X32-SSE41-NEXT: retl - %extmask = sext <4 x i8> %mask to <4 x i64> - ret <4 x i64> %extmask -} - -define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind { -; SSE-LABEL: sext_32xi1_to_32xi8: -; SSE: # %bb.0: -; SSE-NEXT: pcmpeqw %xmm5, %xmm1 -; SSE-NEXT: pcmpeqw %xmm4, %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: pcmpeqw %xmm7, %xmm3 -; SSE-NEXT: pcmpeqw %xmm6, %xmm2 -; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: sext_32xi1_to_32xi8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_32xi1_to_32xi8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: sext_32xi1_to_32xi8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: sext_32xi1_to_32xi8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0 -; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 -; AVX512BW-NEXT: retq -; -; X32-SSE-LABEL: sext_32xi1_to_32xi8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pushl %ebp -; X32-SSE-NEXT: movl %esp, %ebp -; X32-SSE-NEXT: andl $-16, %esp -; X32-SSE-NEXT: subl $16, %esp -; X32-SSE-NEXT: movdqa 8(%ebp), %xmm3 -; X32-SSE-NEXT: pcmpeqw 40(%ebp), %xmm1 -; X32-SSE-NEXT: pcmpeqw 24(%ebp), %xmm0 -; X32-SSE-NEXT: packsswb %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqw 72(%ebp), %xmm3 -; X32-SSE-NEXT: pcmpeqw 56(%ebp), %xmm2 -; X32-SSE-NEXT: packsswb %xmm3, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE-NEXT: movl %ebp, %esp -; X32-SSE-NEXT: popl %ebp -; X32-SSE-NEXT: retl - %a = icmp eq <32 x i16> %c1, %c2 - %b = sext <32 x i1> %a to <32 x i8> - ret <32 x i8> %b -} - -define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) { -; SSE2-LABEL: sext_2i8_to_2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: psrad $24, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm0 -; SSE2-NEXT: retq 
-; -; SSSE3-LABEL: sext_2i8_to_2i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movzwl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: psrad $24, %xmm0 -; SSSE3-NEXT: paddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_2i8_to_2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movzwl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; SSE41-NEXT: paddd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_2i8_to_2i32: -; AVX: # %bb.0: -; AVX-NEXT: movzwl (%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpmovsxbd %xmm0, %xmm0 -; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_2i8_to_2i32: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movzwl (%eax), %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; X32-SSE2-NEXT: psrad $24, %xmm0 -; X32-SSE2-NEXT: paddd %xmm0, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_2i8_to_2i32: -; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movzwl (%eax), %eax -; X32-SSE41-NEXT: movd %eax, %xmm0 -; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0 -; X32-SSE41-NEXT: paddd %xmm0, %xmm0 -; X32-SSE41-NEXT: retl - %x = load <2 x i8>, <2 x i8>* %addr, align 1 - %y = sext <2 x i8> %x to <2 x i32> - %z = add <2 x i32>%y, %y - ret <2 x i32>%z -} - -define <4 x i32> @sext_4i17_to_4i32(<4 x i17>* %ptr) { -; SSE2-LABEL: sext_4i17_to_4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $30, %rcx -; SSE2-NEXT: sarq $47, %rcx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shlq $47, %rcx -; SSE2-NEXT: sarq $47, %rcx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movl 8(%rdi), %ecx -; SSE2-NEXT: shll $13, %ecx -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $51, %rdx -; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: shlq $47, %rdx -; SSE2-NEXT: sarq $47, %rdx -; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: shlq $13, %rax -; SSE2-NEXT: sarq $47, %rax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_4i17_to_4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq (%rdi), %rax -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $30, %rcx -; SSSE3-NEXT: sarq $47, %rcx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shlq $47, %rcx -; SSSE3-NEXT: sarq $47, %rcx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movl 8(%rdi), %ecx -; SSSE3-NEXT: shll $13, %ecx -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: shrq $51, %rdx -; SSSE3-NEXT: orl %ecx, %edx -; SSSE3-NEXT: shlq $47, %rdx -; SSSE3-NEXT: sarq $47, %rdx -; SSSE3-NEXT: movd %edx, %xmm1 -; SSSE3-NEXT: shlq $13, %rax -; SSSE3-NEXT: sarq $47, %rax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: sext_4i17_to_4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movq (%rdi), %rax -; SSE41-NEXT: 
movq %rax, %rcx -; SSE41-NEXT: shlq $30, %rcx -; SSE41-NEXT: sarq $47, %rcx -; SSE41-NEXT: movq %rax, %rdx -; SSE41-NEXT: shlq $47, %rdx -; SSE41-NEXT: sarq $47, %rdx -; SSE41-NEXT: movd %edx, %xmm0 -; SSE41-NEXT: pinsrd $1, %ecx, %xmm0 -; SSE41-NEXT: movq %rax, %rcx -; SSE41-NEXT: shlq $13, %rcx -; SSE41-NEXT: sarq $47, %rcx -; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 -; SSE41-NEXT: movl 8(%rdi), %ecx -; SSE41-NEXT: shll $13, %ecx -; SSE41-NEXT: shrq $51, %rax -; SSE41-NEXT: orl %ecx, %eax -; SSE41-NEXT: shlq $47, %rax -; SSE41-NEXT: sarq $47, %rax -; SSE41-NEXT: pinsrd $3, %eax, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: sext_4i17_to_4i32: -; AVX: # %bb.0: -; AVX-NEXT: movq (%rdi), %rax -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shlq $30, %rcx -; AVX-NEXT: sarq $47, %rcx -; AVX-NEXT: movq %rax, %rdx -; AVX-NEXT: shlq $47, %rdx -; AVX-NEXT: sarq $47, %rdx -; AVX-NEXT: vmovd %edx, %xmm0 -; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movq %rax, %rcx -; AVX-NEXT: shlq $13, %rcx -; AVX-NEXT: sarq $47, %rcx -; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX-NEXT: movl 8(%rdi), %ecx -; AVX-NEXT: shll $13, %ecx -; AVX-NEXT: shrq $51, %rax -; AVX-NEXT: orl %ecx, %eax -; AVX-NEXT: shlq $47, %rax -; AVX-NEXT: sarq $47, %rax -; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; X32-SSE2-LABEL: sext_4i17_to_4i32: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: movl (%eax), %ecx -; X32-SSE2-NEXT: movl 4(%eax), %edx -; X32-SSE2-NEXT: movl 8(%eax), %eax -; X32-SSE2-NEXT: shldl $13, %edx, %eax -; X32-SSE2-NEXT: shll $15, %eax -; X32-SSE2-NEXT: movd %eax, %xmm0 -; X32-SSE2-NEXT: movl %edx, %eax -; X32-SSE2-NEXT: shll $13, %eax -; X32-SSE2-NEXT: movd %eax, %xmm1 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE2-NEXT: shldl $15, %ecx, %edx -; X32-SSE2-NEXT: shll $15, %ecx -; X32-SSE2-NEXT: movd %ecx, %xmm0 -; X32-SSE2-NEXT: shll $15, %edx -; X32-SSE2-NEXT: movd %edx, %xmm2 -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-SSE2-NEXT: psrad $15, %xmm0 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_4i17_to_4i32: -; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: pushl %esi -; X32-SSE41-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE41-NEXT: .cfi_offset %esi, -8 -; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE41-NEXT: movl (%eax), %ecx -; X32-SSE41-NEXT: movl 4(%eax), %edx -; X32-SSE41-NEXT: movl %edx, %esi -; X32-SSE41-NEXT: movl 8(%eax), %eax -; X32-SSE41-NEXT: shldl $13, %edx, %eax -; X32-SSE41-NEXT: shldl $15, %ecx, %edx -; X32-SSE41-NEXT: shll $15, %edx -; X32-SSE41-NEXT: shll $15, %ecx -; X32-SSE41-NEXT: movd %ecx, %xmm0 -; X32-SSE41-NEXT: pinsrd $1, %edx, %xmm0 -; X32-SSE41-NEXT: shll $13, %esi -; X32-SSE41-NEXT: pinsrd $2, %esi, %xmm0 -; X32-SSE41-NEXT: shll $15, %eax -; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0 -; X32-SSE41-NEXT: psrad $15, %xmm0 -; X32-SSE41-NEXT: popl %esi -; X32-SSE41-NEXT: .cfi_def_cfa_offset 4 -; X32-SSE41-NEXT: retl - %a = load <4 x i17>, <4 x i17>* %ptr - %b = sext <4 x i17> %a to <4 x i32> - ret <4 x i32> %b -} - -define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { -; SSE2-LABEL: sext_8i6_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] -; SSE2-NEXT: pshufhw 
{{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $58, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: psrad $26, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $58, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: psrad $26, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $58, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-NEXT: psrad $26, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] -; SSE2-NEXT: psllq $58, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSE2-NEXT: psrad $26, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: sext_8i6_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: psllq $58, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSSE3-NEXT: psrad $26, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: psllq $58, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSSE3-NEXT: psrad $26, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: psllq $58, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSSE3-NEXT: psrad $26, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: psllq $58, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; SSSE3-NEXT: psrad $26, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSSE3-NEXT: retq -; -; 
SSE41-LABEL: sext_8i6_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; SSE41-NEXT: psllq $58, %xmm0 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $31, %xmm1 -; SSE41-NEXT: psrad $26, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: psllq $58, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: psrad $26, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; SSE41-NEXT: psllq $58, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: psrad $26, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; SSE41-NEXT: psllq $58, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: psrad $26, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: sext_8i6_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsllw $10, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $10, %xmm0, %xmm1 -; AVX1-NEXT: vpmovsxwq %xmm1, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] -; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: retq -; -; AVX2-LABEL: sext_8i6_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpsllw $10, %xmm0, %xmm0 -; AVX2-NEXT: vpsraw $10, %xmm0, %xmm1 -; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1 -; AVX2-NEXT: retq -; -; AVX512-LABEL: sext_8i6_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vpsllq $58, %zmm0, %zmm0 -; AVX512-NEXT: vpsraq $58, %zmm0, %zmm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: sext_8i6_to_8i64: -; X32-SSE2: # 
%bb.0: # %entry -; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; X32-SSE2-NEXT: paddw {{\.LCPI.*}}, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] -; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; X32-SSE2-NEXT: psllq $58, %xmm0 -; X32-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE2-NEXT: psrad $31, %xmm1 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X32-SSE2-NEXT: psrad $26, %xmm0 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] -; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; X32-SSE2-NEXT: psllq $58, %xmm1 -; X32-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE2-NEXT: psrad $31, %xmm2 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X32-SSE2-NEXT: psrad $26, %xmm1 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] -; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] -; X32-SSE2-NEXT: psllq $58, %xmm2 -; X32-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X32-SSE2-NEXT: psrad $31, %xmm4 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; X32-SSE2-NEXT: psrad $26, %xmm2 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] -; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] -; X32-SSE2-NEXT: psllq $58, %xmm3 -; X32-SSE2-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE2-NEXT: psrad $31, %xmm4 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3] -; X32-SSE2-NEXT: psrad $26, %xmm3 -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3] -; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: sext_8i6_to_8i64: -; X32-SSE41: # %bb.0: # %entry -; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; X32-SSE41-NEXT: paddw {{\.LCPI.*}}, %xmm3 -; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; X32-SSE41-NEXT: psllq $58, %xmm0 -; X32-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE41-NEXT: psrad $31, %xmm1 -; X32-SSE41-NEXT: psrad $26, %xmm0 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; X32-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] -; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; X32-SSE41-NEXT: psllq $58, %xmm1 -; X32-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE41-NEXT: psrad $31, %xmm2 -; X32-SSE41-NEXT: psrad $26, %xmm1 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; X32-SSE41-NEXT: psllq $58, %xmm2 -; X32-SSE41-NEXT: movdqa %xmm2, %xmm4 -; X32-SSE41-NEXT: psrad $31, %xmm4 -; X32-SSE41-NEXT: psrad $26, %xmm2 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X32-SSE41-NEXT: pblendw {{.*#+}} xmm2 
= xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; X32-SSE41-NEXT: psllq $58, %xmm3 -; X32-SSE41-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE41-NEXT: psrad $31, %xmm4 -; X32-SSE41-NEXT: psrad $26, %xmm3 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X32-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; X32-SSE41-NEXT: retl -entry: - %a = trunc i32 %x to i6 - %b = insertelement <8 x i6> undef, i6 %a, i32 0 - %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer - %d = add <8 x i6> %c, - %e = sext <8 x i6> %d to <8 x i64> - ret <8 x i64> %e -} - -define <8 x i32> @zext_negate_sext(<8 x i8> %x) { -; SSE2-LABEL: zext_negate_sext: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: psubw %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_negate_sext: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: psubw %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_negate_sext: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: psubw %xmm0, %xmm1 -; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_negate_sext: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_negate_sext: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_negate_sext: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; 
AVX512-NEXT: vpsubd %ymm0, %ymm1, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: zext_negate_sext: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X32-SSE2-NEXT: psubw %xmm0, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: psrad $16, %xmm0 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: zext_negate_sext: -; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X32-SSE41-NEXT: pxor %xmm1, %xmm1 -; X32-SSE41-NEXT: psubw %xmm0, %xmm1 -; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1 -; X32-SSE41-NEXT: retl - %z = zext <8 x i8> %x to <8 x i16> - %neg = sub nsw <8 x i16> zeroinitializer, %z - %r = sext <8 x i16> %neg to <8 x i32> - ret <8 x i32> %r -} - -define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) { -; SSE2-LABEL: zext_decremenet_sext: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_decremenet_sext: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 -; SSSE3-NEXT: paddw %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_decremenet_sext: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: paddw %xmm0, %xmm1 -; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_decremenet_sext: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_decremenet_sext: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_decremenet_sext: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: retq -; -; X32-SSE2-LABEL: zext_decremenet_sext: -; X32-SSE2: # %bb.0: -; X32-SSE2-NEXT: pxor %xmm1, %xmm1 -; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X32-SSE2-NEXT: pcmpeqd %xmm1, %xmm1 -; X32-SSE2-NEXT: paddw %xmm0, %xmm1 -; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X32-SSE2-NEXT: psrad $16, %xmm0 -; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; X32-SSE2-NEXT: psrad $16, %xmm1 -; X32-SSE2-NEXT: retl -; -; X32-SSE41-LABEL: zext_decremenet_sext: -; X32-SSE41: # %bb.0: -; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; X32-SSE41-NEXT: paddw %xmm0, %xmm1 -; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0 -; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1 -; X32-SSE41-NEXT: retl - %z = zext <8 x i8> %x to <8 x i16> - %dec = add <8 x i16> %z, - %r = sext <8 x i16> %dec to <8 x i32> - ret <8 x i32> %r -} diff --git a/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll b/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll deleted file mode 100644 index cd7902ea98d..00000000000 --- a/test/CodeGen/X86/vector-shift-ashr-sub128-widen.ll +++ /dev/null @@ -1,2481 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s 
-x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL -; -; Just one 32-bit run to make sure we do reasonable things for i64 shifts. -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 - -; -; Variable Shifts -; - -define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { -; SSE2-LABEL: var_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrad %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrad %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrad %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad %xmm1, %xmm3 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: var_shift_v2i32: -; XOPAVX1: # %bb.0: -; 
XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: var_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrad %xmm2, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrad %xmm4, %xmm2 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psrad %xmm3, %xmm4 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: psrad %xmm1, %xmm0 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; X32-SSE-NEXT: movaps %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i32> %a, %b - ret <2 x i32> %shift -} - -define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psraw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, 
%xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: var_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v4i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v4i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $12, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psraw $8, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psraw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psraw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, 
%xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: psraw $15, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 -; X32-SSE-NEXT: psraw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <4 x i16> %a, %b - ret <4 x i16> %shift -} - -define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psraw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: 
vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: var_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v2i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v2i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $12, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psraw $8, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psraw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psraw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: psraw $15, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 -; X32-SSE-NEXT: psraw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i16> %a, %b - ret <2 x i16> %shift -} - -define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $4, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; 
SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $2, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $4, %xmm4 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $2, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX-NEXT: 
vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; 
AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm6 -; X32-SSE-NEXT: pandn %xmm2, %xmm6 -; X32-SSE-NEXT: psraw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm6, %xmm2 -; X32-SSE-NEXT: paddw %xmm4, %xmm4 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm6 -; X32-SSE-NEXT: pandn %xmm2, %xmm6 -; X32-SSE-NEXT: psraw $2, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm6, %xmm2 -; X32-SSE-NEXT: paddw %xmm4, %xmm4 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm4 -; X32-SSE-NEXT: pandn %xmm2, %xmm4 -; X32-SSE-NEXT: psraw $1, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm4, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm4, %xmm5 -; X32-SSE-NEXT: pandn %xmm0, %xmm5 -; X32-SSE-NEXT: psraw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: por %xmm5, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm4, %xmm5 -; X32-SSE-NEXT: pandn %xmm0, %xmm5 -; X32-SSE-NEXT: psraw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: por %xmm5, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: psraw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <8 x i8> %a, %b - ret <8 x i8> %shift -} - -define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $4, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, 
%xmm6 -; SSE2-NEXT: psraw $2, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $4, %xmm4 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $2, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, 
%xmm3 -; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $1, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; 
AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm6 -; X32-SSE-NEXT: pandn %xmm2, %xmm6 -; X32-SSE-NEXT: psraw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm6, %xmm2 -; X32-SSE-NEXT: paddw %xmm4, %xmm4 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm6 -; X32-SSE-NEXT: pandn %xmm2, %xmm6 -; X32-SSE-NEXT: psraw $2, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm6, %xmm2 -; X32-SSE-NEXT: paddw %xmm4, %xmm4 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm4 -; X32-SSE-NEXT: pandn %xmm2, %xmm4 -; X32-SSE-NEXT: psraw $1, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm4, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm4, %xmm5 -; X32-SSE-NEXT: pandn %xmm0, %xmm5 -; X32-SSE-NEXT: psraw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: por %xmm5, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm4, %xmm5 -; X32-SSE-NEXT: pandn %xmm0, %xmm5 -; X32-SSE-NEXT: psraw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: por %xmm5, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: psraw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <4 x i8> %a, %b - ret <4 x i8> %shift -} - -define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $4, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: psraw $2, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: paddw %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; 
SSE2-NEXT: pcmpgtw %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pandn %xmm2, %xmm4 -; SSE2-NEXT: psraw $1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: por %xmm4, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $4, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pandn %xmm0, %xmm5 -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtw %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psraw $1, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $4, %xmm4 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $2, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psraw $1, %xmm4 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $4, %xmm2 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $2, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psraw $1, %xmm2 -; SSE41-NEXT: paddw %xmm0, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX-NEXT: vpsraw $4, %xmm3, %xmm4 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $2, %xmm3, %xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3 -; AVX-NEXT: vpsraw $1, %xmm3, 
%xmm4 -; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX-NEXT: vpsraw $4, %xmm0, %xmm3 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $2, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsraw $1, %xmm0, %xmm3 -; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm6 -; X32-SSE-NEXT: pandn %xmm2, %xmm6 -; X32-SSE-NEXT: psraw $4, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm6, %xmm2 -; X32-SSE-NEXT: paddw %xmm4, %xmm4 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm6 -; X32-SSE-NEXT: pandn %xmm2, %xmm6 -; X32-SSE-NEXT: psraw $2, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm6, %xmm2 -; X32-SSE-NEXT: paddw %xmm4, %xmm4 -; X32-SSE-NEXT: pxor %xmm5, %xmm5 -; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5 -; X32-SSE-NEXT: movdqa %xmm5, %xmm4 -; X32-SSE-NEXT: pandn %xmm2, %xmm4 -; X32-SSE-NEXT: psraw $1, %xmm2 -; X32-SSE-NEXT: pand %xmm5, %xmm2 -; X32-SSE-NEXT: por %xmm4, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm4, %xmm5 -; X32-SSE-NEXT: pandn %xmm0, %xmm5 -; X32-SSE-NEXT: psraw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: por %xmm5, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm4, %xmm4 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4 -; X32-SSE-NEXT: movdqa %xmm4, %xmm5 -; X32-SSE-NEXT: pandn %xmm0, %xmm5 -; X32-SSE-NEXT: psraw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm4, %xmm0 -; X32-SSE-NEXT: por %xmm5, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: psraw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i8> %a, %b - ret <2 x i8> %shift -} - -; -; Uniform Variable Shifts -; - -define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE2-NEXT: psrad %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: psrad %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, 
%xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: xorps %xmm2, %xmm2 -; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X32-SSE-NEXT: psrad %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer - %shift = ashr <2 x i32> %a, %splat - ret <2 x i32> %shift -} - -define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psraw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: psraw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psraw %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer - %shift = ashr <4 x i16> %a, %splat - ret <4 x i16> %shift -} - -define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psraw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: psraw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpsraw 
%xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psraw %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer - %shift = ashr <2 x i16> %a, %splat - ret <2 x i16> %shift -} - -define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psubb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, 
%xmm0 -; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v8i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_shift_v8i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = 
xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: pxor %xmm2, %xmm0 -; X32-SSE-NEXT: psubb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer - %shift = ashr <8 x i8> %a, %splat - ret <8 x i8> %shift -} - -define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psubb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX2-NEXT: vpsrlw %xmm1, 
%xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} 
xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: pxor %xmm2, %xmm0 -; X32-SSE-NEXT: psubb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer - %shift = ashr <4 x i8> %a, %splat - ret <4 x i8> %shift -} - -define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: psubb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm2, %xmm0 -; SSE41-NEXT: psubb %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2 -; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; AVX2-NEXT: vpsrlw 
%xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} 
xmm2 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: pxor %xmm2, %xmm0 -; X32-SSE-NEXT: psubb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer - %shift = ashr <2 x i8> %a, %splat - ret <2 x i8> %shift -} - -; -; Constant Shifts -; - -define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $4, %xmm1 -; SSE2-NEXT: psrad $5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrad $5, %xmm1 -; SSE41-NEXT: psrad $4, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 -; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: constant_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: constant_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: constant_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: constant_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrad $4, %xmm1 -; X32-SSE-NEXT: psrad $5, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i32> %a, - ret <2 x i32> %shift -} - -define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { -; SSE2-LABEL: constant_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psraw $2, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: andps %xmm2, %xmm0 -; SSE2-NEXT: psraw $1, %xmm1 -; SSE2-NEXT: andnps %xmm1, %xmm2 -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = -; SSE41-NEXT: pmulhw %xmm0, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: psraw $1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: retq -; -; AVX-LABEL: constant_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpsraw $1, %xmm0, %xmm0 
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: retq -; -; XOP-LABEL: constant_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v4i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v4i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0 -; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psraw $2, %xmm1 -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535] -; X32-SSE-NEXT: movaps %xmm1, %xmm0 -; X32-SSE-NEXT: andps %xmm2, %xmm0 -; X32-SSE-NEXT: psraw $1, %xmm1 -; X32-SSE-NEXT: andnps %xmm1, %xmm2 -; X32-SSE-NEXT: orps %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <4 x i16> %a, - ret <4 x i16> %shift -} - -define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psraw $3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: psraw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psraw $3, %xmm1 -; SSE41-NEXT: psraw $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: retq -; -; AVX-LABEL: constant_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsraw $3, %xmm0, %xmm1 -; AVX-NEXT: vpsraw $2, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: retq -; -; XOP-LABEL: constant_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsraw $3, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpsraw $2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v2i16: -; AVX512DQVL: # %bb.0: -; 
AVX512DQVL-NEXT: vpsraw $3, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpsraw $2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v2i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psraw $3, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; X32-SSE-NEXT: psraw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pandn %xmm1, %xmm2 -; X32-SSE-NEXT: por %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i16> %a, - ret <2 x i16> %shift -} - -define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { -; SSE-LABEL: constant_shift_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: constant_shift_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v8i8: -; X32-SSE: # 
%bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: psraw $8, %xmm0 -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <8 x i8> %a, - ret <8 x i8> %shift -} - -define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { -; SSE-LABEL: constant_shift_v4i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: constant_shift_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: psraw $8, %xmm0 -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <4 x i8> %a, - ret <4 x i8> %shift -} - -define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE-LABEL: constant_shift_v2i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: constant_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0 -; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0 -; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: psraw $8, %xmm0 -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i8> %a, - ret <2 x i8> %shift -} - -; -; Uniform Constant Shifts -; - -define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: psrad $5, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpsrad $5, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpsrad $5, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrad $5, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i32> %a, - ret <2 x i32> %shift -} - -define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: psraw $3, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psraw $3, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <4 x i16> %a, - ret <4 x i16> %shift -} - -define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: psraw $3, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsraw $3, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpsraw $3, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psraw $3, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i16> %a, - ret <2 x i16> %shift -} - -define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; 
XOP-LABEL: splatconstant_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrlw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: pxor %xmm1, %xmm0 -; X32-SSE-NEXT: psubb %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <8 x i8> %a, - ret <8 x i8> %shift -} - -define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v4i8: -; SSE: # %bb.0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrlw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: pxor %xmm1, %xmm0 -; X32-SSE-NEXT: psubb %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <4 x i8> %a, - ret <4 x i8> %shift -} - -define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i8: -; SSE: # %bb.0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; SSE-NEXT: pxor %xmm1, %xmm0 -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i8: -; AVX: # %bb.0: -; 
AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrlw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] -; X32-SSE-NEXT: pxor %xmm1, %xmm0 -; X32-SSE-NEXT: psubb %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = ashr <2 x i8> %a, - ret <2 x i8> %shift -} diff --git a/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll b/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll deleted file mode 100644 index 79f8cc6a05f..00000000000 --- a/test/CodeGen/X86/vector-shift-lshr-sub128-widen.ll +++ /dev/null @@ -1,2151 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL -; RUN: llc < %s 
-x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL -; -; Just one 32-bit run to make sure we do reasonable things for i64 shifts. -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 - -; -; Variable Shifts -; - -define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { -; SSE2-LABEL: var_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrld %xmm2, %xmm3 -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrld %xmm4, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: psrld %xmm3, %xmm4 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE2-NEXT: psrld %xmm1, %xmm0 -; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; SSE2-NEXT: movaps %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm2, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm5 -; SSE41-NEXT: psrld %xmm4, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrld %xmm1, %xmm3 -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7] -; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: var_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; 
AVX512VL-LABEL: var_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm3 -; X32-SSE-NEXT: psrld %xmm2, %xmm3 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: psrld %xmm4, %xmm2 -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X32-SSE-NEXT: movdqa %xmm0, %xmm4 -; X32-SSE-NEXT: psrld %xmm3, %xmm4 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7] -; X32-SSE-NEXT: psrld %xmm1, %xmm0 -; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1] -; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3] -; X32-SSE-NEXT: movaps %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i32> %a, %b - ret <2 x i32> %shift -} - -define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 -; 
AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: var_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v4i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v4i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $12, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psrlw $2, 
%xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: psraw $15, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 -; X32-SSE-NEXT: psrlw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <4 x i16> %a, %b - ret <4 x i16> %shift -} - -define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psraw $15, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm1 -; SSE2-NEXT: psraw $15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: psllw $12, %xmm0 -; SSE41-NEXT: psllw $4, %xmm2 -; SSE41-NEXT: por %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddw %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $4, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $2, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm2 -; SSE41-NEXT: psrlw $1, %xmm2 -; SSE41-NEXT: paddw %xmm3, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2 -; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3 -; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 -; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: var_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v2i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v2i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $12, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: psraw $15, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm3 -; X32-SSE-NEXT: pandn %xmm0, %xmm3 -; X32-SSE-NEXT: psrlw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm3, %xmm0 -; X32-SSE-NEXT: paddw %xmm1, %xmm1 -; X32-SSE-NEXT: psraw $15, %xmm1 -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 -; X32-SSE-NEXT: psrlw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i16> %a, %b - ret <2 x i16> %shift -} - -define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm1 -; 
SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $1, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v8i8: -; AVX512BW: # %bb.0: -; 
AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psrlw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift 
= lshr <8 x i8> %a, %b - ret <8 x i8> %shift -} - -define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $1, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; 
AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psrlw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm1 -; 
X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <4 x i8> %a, %b - ret <4 x i8> %shift -} - -define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrlw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrlw $1, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psrlw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; 
X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psrlw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $1, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i8> %a, %b - ret <2 x i8> %shift -} - -; -; Uniform Variable Shifts -; - -define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE2-NEXT: psrld %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: psrld %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: xorps %xmm2, %xmm2 -; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X32-SSE-NEXT: psrld %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer - %shift = lshr <2 x i32> %a, %splat - ret <2 x i32> %shift -} - -define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: 
vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer - %shift = lshr <4 x i16> %a, %splat - ret <4 x i16> %shift -} - -define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer - %shift = lshr <2 x i16> %a, %splat - ret <2 x i16> %shift -} - -define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; 
SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v8i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_shift_v8i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # 
kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer - %shift = lshr <8 x i8> %a, %splat - ret <8 x i8> %shift -} - -define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm0 -; 
SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer - %shift = lshr <4 x i8> %a, %splat - ret <4 x i8> %shift -} - -define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psrlw %xmm1, %xmm0 -; 
SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psrlw %xmm1, %xmm2 -; SSE2-NEXT: psrlw $8, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psrlw %xmm1, %xmm2 -; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] -; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = 
xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psrlw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psrlw %xmm1, %xmm2 -; X32-SSE-NEXT: psrlw $8, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer - %shift = lshr <2 x i8> %a, %splat - ret <2 x i8> %shift -} - -; -; Constant Shifts -; - -define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $4, %xmm1 -; SSE2-NEXT: psrld $5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; 
SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $5, %xmm1 -; SSE41-NEXT: psrld $4, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 -; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: constant_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: constant_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: constant_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: constant_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrld $4, %xmm1 -; X32-SSE-NEXT: psrld $5, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i32> %a, - ret <2 x i32> %shift -} - -define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { -; SSE2-LABEL: constant_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = -; SSE41-NEXT: pmulhuw %xmm0, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE41-NEXT: retq -; -; AVX-LABEL: constant_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: retq -; -; XOP-LABEL: constant_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v4i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v4i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = 
[0,65535,65535,65535,65535,65535,65535,65535] -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: pandn %xmm0, %xmm2 -; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: por %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <4 x i16> %a, - ret <4 x i16> %shift -} - -define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrlw $3, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: psrlw $2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm1, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrlw $3, %xmm1 -; SSE41-NEXT: psrlw $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: retq -; -; AVX-LABEL: constant_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1 -; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: retq -; -; XOP-LABEL: constant_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsrlw $3, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpsrlw $2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v2i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpsrlw $2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v2i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: psrlw $3, %xmm1 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535] -; X32-SSE-NEXT: psrlw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: pandn %xmm1, %xmm2 -; X32-SSE-NEXT: por %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i16> %a, - ret <2 x i16> %shift -} - -define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; 
AVX512BWVL-LABEL: constant_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <8 x i8> %a, - ret <8 x i8> %shift -} - -define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v4i8: -; XOP: # %bb.0: 
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <4 x i8> %a, - ret <4 x i8> %shift -} - -define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: psrlw $8, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1 -; SSE41-NEXT: psrlw $8, %xmm1 -; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: movdqa %xmm0, %xmm2 -; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: psrlw $8, %xmm0 -; X32-SSE-NEXT: packuswb %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i8> %a, - ret <2 x i8> %shift -} - -; -; Uniform Constant Shifts -; - -define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: psrld $5, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpsrld $5, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpsrld $5, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrld $5, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i32> %a, - ret <2 x i32> %shift -} - -define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrlw $3, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <4 x i16> %a, - ret <4 x i16> %shift -} - -define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i16: -; SSE: # 
%bb.0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrlw $3, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i16> %a, - ret <2 x i16> %shift -} - -define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrlw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <8 x i8> %a, - ret <8 x i8> %shift -} - -define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v4i8: -; SSE: # %bb.0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrlw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <4 x i8> %a, - ret <4 x i8> %shift -} - -define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i8: -; SSE: # %bb.0: -; SSE-NEXT: psrlw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: 
retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psrlw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: retl - %shift = lshr <2 x i8> %a, - ret <2 x i8> %shift -} diff --git a/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll b/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll deleted file mode 100644 index 9222dc09400..00000000000 --- a/test/CodeGen/X86/vector-shift-shl-sub128-widen.ll +++ /dev/null @@ -1,1940 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL -; -; Just one 32-bit run to make sure we do reasonable things for i64 shifts. 
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2 - -; -; Variable Shifts -; - -define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { -; SSE2-LABEL: var_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1 -; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pmulld %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: var_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: var_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: var_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: var_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $23, %xmm1 -; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1 -; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X32-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE-NEXT: retl - %shift = shl <2 x i32> %a, %b - ret <2 x i32> %shift -} - -define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = 
xmm1[0],xmm2[0] -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE41-NEXT: packusdw %xmm1, %xmm2 -; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v4i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v4i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: var_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v4i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v4i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; 
AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE-NEXT: pslld $23, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X32-SSE-NEXT: paddd %xmm3, %xmm2 -; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE-NEXT: pslld $23, %xmm1 -; X32-SSE-NEXT: paddd %xmm3, %xmm1 -; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE-NEXT: pmullw %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <4 x i16> %a, %b - ret <4 x i16> %shift -} - -define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: var_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: pslld $23, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE2-NEXT: paddd %xmm3, %xmm2 -; SSE2-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: pslld $23, %xmm1 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: pmullw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE41-NEXT: pslld $23, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; SSE41-NEXT: paddd %xmm3, %xmm1 -; SSE41-NEXT: cvttps2dq %xmm1, %xmm1 -; SSE41-NEXT: pslld $23, %xmm2 -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: cvttps2dq %xmm2, %xmm2 -; SSE41-NEXT: packusdw %xmm1, %xmm2 -; SSE41-NEXT: pmullw %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: var_shift_v2i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX1-NEXT: vpslld $23, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX1-NEXT: vpslld $23, %xmm1, %xmm1 -; 
AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: var_shift_v2i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: var_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v2i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 -; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v2i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm1, %xmm2 -; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; X32-SSE-NEXT: pslld $23, %xmm2 -; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] -; X32-SSE-NEXT: paddd %xmm3, %xmm2 -; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; X32-SSE-NEXT: pslld $23, %xmm1 -; X32-SSE-NEXT: paddd %xmm3, %xmm1 -; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1 -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; X32-SSE-NEXT: 
pmullw %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <2 x i16> %a, %b - ret <2 x i16> %shift -} - -define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psllw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; 
AVX512BW-LABEL: var_shift_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psllw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psllw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: paddb %xmm0, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; 
X32-SSE-NEXT: retl - %shift = shl <8 x i8> %a, %b - ret <8 x i8> %shift -} - -define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psllw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v4i8: 
-; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psllw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psllw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: paddb %xmm0, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <4 x i8> 
%a, %b - ret <4 x i8> %shift -} - -define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { -; SSE2-LABEL: var_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: psllw $5, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $4, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: psllw $2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: paddb %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: var_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psllw $5, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psllw $4, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psllw $2, %xmm3 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm2, %xmm3 -; SSE41-NEXT: paddb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: var_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $5, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm2 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2 -; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: var_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: var_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: var_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: 
vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: var_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: var_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: var_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $5, %xmm1 -; X32-SSE-NEXT: pxor %xmm2, %xmm2 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psllw $4, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pxor %xmm3, %xmm3 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3 -; X32-SSE-NEXT: movdqa %xmm3, %xmm4 -; X32-SSE-NEXT: pandn %xmm0, %xmm4 -; X32-SSE-NEXT: psllw $2, %xmm0 -; X32-SSE-NEXT: pand %xmm3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: por %xmm4, %xmm0 -; X32-SSE-NEXT: paddb %xmm1, %xmm1 -; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2 -; X32-SSE-NEXT: movdqa %xmm2, %xmm1 -; X32-SSE-NEXT: pandn %xmm0, %xmm1 -; X32-SSE-NEXT: paddb %xmm0, %xmm0 -; X32-SSE-NEXT: pand %xmm2, %xmm0 -; X32-SSE-NEXT: por %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <2 x i8> %a, %b - ret <2 x i8> %shift -} - -; -; 
Uniform Variable Shifts -; - -define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; SSE2-NEXT: pslld %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pslld %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero -; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: xorps %xmm2, %xmm2 -; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] -; X32-SSE-NEXT: pslld %xmm2, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer - %shift = shl <2 x i32> %a, %splat - ret <2 x i32> %shift -} - -define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v4i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psllw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v4i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: psllw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psllw %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer - %shift = shl <4 x i16> %a, %splat - ret <4 x 
i16> %shift -} - -define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psllw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: psllw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: splatvar_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatvar_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatvar_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psllw %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer - %shift = shl <2 x i16> %a, %splat - ret <2 x i16> %shift -} - -define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psllw %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psllw %xmm1, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psllw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psllw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufb %xmm1, %xmm2 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq 
-; -; AVX2-LABEL: splatvar_shift_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v8i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_shift_v8i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psllw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psllw %xmm1, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer - %shift = shl <8 x i8> %a, %splat - ret <8 x i8> %shift -} - -define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psllw %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psllw %xmm1, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psllw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psllw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufb %xmm1, %xmm2 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 -; 
AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: splatvar_shift_v4i8: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7] -; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: splatvar_shift_v4i8: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psllw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psllw %xmm1, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer - %shift = shl <4 x i8> %a, %splat - ret <4 x i8> %shift -} - -define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { -; SSE2-LABEL: splatvar_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: psllw %xmm1, %xmm0 -; SSE2-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE2-NEXT: psllw %xmm1, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: splatvar_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psllw %xmm1, %xmm0 -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: psllw %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pshufb %xmm1, %xmm2 -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatvar_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1 -; 
AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatvar_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOP-LABEL: splatvar_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u] -; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: splatvar_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: splatvar_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: splatvar_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: splatvar_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: splatvar_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0] -; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; X32-SSE-NEXT: psllw %xmm1, %xmm0 -; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2 -; X32-SSE-NEXT: psllw %xmm1, %xmm2 -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7] -; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-SSE-NEXT: pand %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer - %shift = shl <2 x i8> %a, %splat - ret <2 x i8> %shift -} - -; -; Constant Shifts -; - -define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslld $4, %xmm1 -; SSE2-NEXT: pslld $5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pslld $5, %xmm1 -; SSE41-NEXT: pslld $4, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v2i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpslld $5, %xmm0, %xmm1 -; AVX1-NEXT: vpslld $4, %xmm0, %xmm0 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v2i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: retq -; -; XOPAVX1-LABEL: constant_shift_v2i32: -; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX1-NEXT: retq -; -; XOPAVX2-LABEL: constant_shift_v2i32: -; XOPAVX2: # %bb.0: -; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; XOPAVX2-NEXT: retq -; -; AVX512-LABEL: constant_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; 
AVX512VL-LABEL: constant_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movdqa %xmm0, %xmm1 -; X32-SSE-NEXT: pslld $4, %xmm1 -; X32-SSE-NEXT: pslld $5, %xmm0 -; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X32-SSE-NEXT: movdqa %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <2 x i32> %a, - ret <2 x i32> %shift -} - -define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { -; SSE-LABEL: constant_shift_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: constant_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: constant_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v4i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v4i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u> -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v4i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v4i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <4 x i16> %a, - ret <4 x i16> %shift -} - -define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i16: -; SSE2: # %bb.0: -; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psllw $3, %xmm1 -; SSE41-NEXT: psllw $2, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: retq -; -; AVX-LABEL: constant_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm1 -; AVX-NEXT: vpsllw $2, %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: retq -; -; XOP-LABEL: constant_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v2i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsllw $3, %xmm0, %xmm1 -; AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm0 -; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v2i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u> -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v2i16: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpsllw $3, %xmm0, %xmm1 -; AVX512DQVL-NEXT: vpsllw $2, %xmm0, %xmm0 -; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7] -; 
AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v2i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <2 x i16> %a, - ret <2 x i16> %shift -} - -define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v8i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v8i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: packuswb %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <8 x i8> %a, - ret <8 x i8> %shift -} - -define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v4i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v4i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v4i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v4i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v4i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v4i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v4i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v4i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: packuswb %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <4 x i8> %a, - ret <4 x i8> %shift -} - -define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE2-LABEL: constant_shift_v2i8: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSE41-LABEL: constant_shift_v2i8: -; SSE41: # %bb.0: -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: constant_shift_v2i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: constant_shift_v2i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; XOP-LABEL: constant_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512DQ-LABEL: constant_shift_v2i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq -; -; AVX512BW-LABEL: constant_shift_v2i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQVL-LABEL: constant_shift_v2i8: -; AVX512DQVL: # %bb.0: -; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0 -; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQVL-NEXT: vzeroupper -; AVX512DQVL-NEXT: retq -; -; AVX512BWVL-LABEL: constant_shift_v2i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -; -; X32-SSE-LABEL: constant_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; 
X32-SSE-NEXT: pxor %xmm1, %xmm1 -; X32-SSE-NEXT: packuswb %xmm1, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <2 x i8> %a, - ret <2 x i8> %shift -} - -; -; Uniform Constant Shifts -; - -define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i32: -; SSE: # %bb.0: -; SSE-NEXT: pslld $5, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i32: -; AVX: # %bb.0: -; AVX-NEXT: vpslld $5, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i32: -; XOP: # %bb.0: -; XOP-NEXT: vpslld $5, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $5, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i32: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: pslld $5, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <2 x i32> %a, - ret <2 x i32> %shift -} - -define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v4i16: -; SSE: # %bb.0: -; SSE-NEXT: psllw $3, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v4i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v4i16: -; XOP: # %bb.0: -; XOP-NEXT: vpsllw $3, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v4i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v4i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v4i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $3, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <4 x i16> %a, - ret <4 x i16> %shift -} - -define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i16: -; SSE: # %bb.0: -; SSE-NEXT: psllw $3, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i16: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i16: -; XOP: # %bb.0: -; XOP-NEXT: vpsllw $3, %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i16: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $3, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <2 x i16> %a, - ret <2 x i16> %shift -} - -define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: psllw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v8i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v8i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand 
{{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v8i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <8 x i8> %a, - ret <8 x i8> %shift -} - -define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v4i8: -; SSE: # %bb.0: -; SSE-NEXT: psllw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v4i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v4i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v4i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v4i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v4i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <4 x i8> %a, - ret <4 x i8> %shift -} - -define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind { -; SSE-LABEL: splatconstant_shift_v2i8: -; SSE: # %bb.0: -; SSE-NEXT: psllw $3, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: splatconstant_shift_v2i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq -; -; XOP-LABEL: splatconstant_shift_v2i8: -; XOP: # %bb.0: -; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0 -; XOP-NEXT: retq -; -; AVX512-LABEL: splatconstant_shift_v2i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq -; -; AVX512VL-LABEL: splatconstant_shift_v2i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512VL-NEXT: retq -; -; X32-SSE-LABEL: splatconstant_shift_v2i8: -; X32-SSE: # %bb.0: -; X32-SSE-NEXT: psllw $3, %xmm0 -; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0 -; X32-SSE-NEXT: retl - %shift = shl <2 x i8> %a, - ret <2 x i8> %shift -} diff --git a/test/CodeGen/X86/vector-trunc-math-widen.ll b/test/CodeGen/X86/vector-trunc-math-widen.ll deleted file mode 100644 index 1c2813d35f0..00000000000 --- a/test/CodeGen/X86/vector-trunc-math-widen.ll +++ /dev/null @@ -1,5197 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown 
-mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ - -; -; add -; - -define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; SSE-LABEL: trunc_add_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_add_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <4 x i64> %a0, %a1 - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { -; SSE-LABEL: trunc_add_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: paddq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = 
xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_add_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <8 x i64> %a0, %a1 - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { -; SSE-LABEL: trunc_add_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm2, %xmm0 -; SSE-NEXT: paddd %xmm3, %xmm1 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_add_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_add_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; 
AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <8 x i32> %a0, %a1 - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { -; SSE-LABEL: trunc_add_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 -; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, 
%xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_add_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <16 x i64> %a0, %a1 - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { -; SSE-LABEL: trunc_add_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: paddd %xmm4, %xmm0 -; SSE-NEXT: paddd %xmm5, %xmm1 -; SSE-NEXT: paddd %xmm6, %xmm2 -; SSE-NEXT: paddd %xmm7, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; 
AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_add_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_add_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <16 x i32> %a0, %a1 - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { -; SSE-LABEL: trunc_add_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: paddw %xmm2, %xmm0 -; SSE-NEXT: paddw %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_add_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_add_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_add_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_add_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = add <16 x i16> %a0, %a1 - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) { -; SSE-LABEL: 
trunc_add_v8i32_v8i16_sext_8i8: -; SSE: # %bb.0: -; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm2, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: paddw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0 -; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> - %2 = sext <8 x i8> %1 to <8 x i32> - %3 = add <8 x i32> %2, %a1 - %4 = trunc <8 x i32> %3 to <8 x i16> - ret <8 x i16> %4 -} - -; -; add to constant -; - -define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind { -; SSE-LABEL: trunc_add_const_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_const_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_add_const_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <4 x i64> %a0, - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { -; SSE-LABEL: trunc_add_const_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = 
xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_const_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_add_const_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <8 x i64> %a0, - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind { -; SSE-LABEL: trunc_add_const_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: paddw {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_const_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_add_const_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; 
AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_add_const_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <8 x i32> %a0, - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { -; SSE-LABEL: trunc_add_const_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_const_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_add_const_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <16 x i64> %a0, - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind { -; SSE-LABEL: trunc_add_const_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_const_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_add_const_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_add_const_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = add <16 x i32> %a0, - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind { -; SSE-LABEL: trunc_add_const_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand 
%xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: paddb {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_add_const_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_add_const_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_add_const_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = add <16 x i16> %a0, - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -; -; sub -; - -define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; SSE-LABEL: trunc_sub_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: psubq %xmm3, %xmm1 -; SSE-NEXT: psubq %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_sub_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <4 x i64> %a0, %a1 - %2 = trunc <4 x i64> %1 to <4 
x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { -; SSE-LABEL: trunc_sub_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: psubq %xmm6, %xmm2 -; SSE-NEXT: psubq %xmm7, %xmm3 -; SSE-NEXT: psubq %xmm4, %xmm0 -; SSE-NEXT: psubq %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7] -; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7] -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_sub_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <8 x i64> %a0, %a1 - %2 = trunc <8 x i64> %1 to <8 
x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { -; SSE-LABEL: trunc_sub_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: psubd %xmm2, %xmm0 -; SSE-NEXT: psubd %xmm3, %xmm1 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_sub_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_sub_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <8 x i32> %a0, %a1 - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { -; SSE-LABEL: trunc_sub_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm7, 
%xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 -; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_sub_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, 
%xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <16 x i64> %a0, %a1 - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { -; SSE-LABEL: trunc_sub_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: psubd %xmm4, %xmm0 -; SSE-NEXT: psubd %xmm5, %xmm1 -; SSE-NEXT: psubd %xmm6, %xmm2 -; SSE-NEXT: psubd %xmm7, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_sub_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_sub_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <16 x i32> %a0, %a1 - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { -; SSE-LABEL: trunc_sub_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: psubw %xmm2, %xmm0 -; SSE-NEXT: psubw %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_sub_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: 
vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_sub_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_sub_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = sub <16 x i16> %a0, %a1 - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) { -; SSE-LABEL: trunc_ext_sub_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: psubb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: trunc_ext_sub_v16i16_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq - %a = zext <16 x i8> %x to <16 x i16> - %b = zext <16 x i8> %y to <16 x i16> - %c = sub <16 x i16> %a, %b - %d = trunc <16 x i16> %c to <16 x i8> - ret <16 x i8> %d -} - -; -; sub to constant -; - -define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind { -; SSE-LABEL: trunc_sub_const_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: psubd {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_const_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_sub_const_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <4 x i64> %a0, - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { -; SSE-LABEL: trunc_sub_const_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 
= xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_const_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_sub_const_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <8 x i64> %a0, - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind { -; SSE-LABEL: trunc_sub_const_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: psubw {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_const_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_sub_const_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_sub_const_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <8 x i32> %a0, - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { -; SSE-LABEL: trunc_sub_const_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_const_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: 
trunc_sub_const_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_sub_const_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <16 x i64> %a0, - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind { -; SSE-LABEL: trunc_sub_const_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_const_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_sub_const_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_sub_const_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = sub <16 x i32> %a0, - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) 
nounwind { -; SSE-LABEL: trunc_sub_const_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_sub_const_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_sub_const_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = sub <16 x i16> %a0, - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) { -; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: psubb {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq - %a = zext <16 x i8> %x to <16 x i16> - %b = sub <16 x i16> %a, - %c = trunc <16 x i16> %b to <16 x i8> - ret <16 x i8> %c -} - -define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) { -; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; SSE-NEXT: psubb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8: -; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] -; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0 -; AVX-NEXT: retq - %a = zext <16 x i8> %x to <16 x i16> - %b = sub <16 x i16> , %a - %c = trunc <16 x i16> %b to <16 x i8> - ret <16 x i8> %c -} - -; -; mul -; - -define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; SSE-LABEL: trunc_mul_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_mul_v4i64_v4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_v4i64_v4i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = mul <4 x i64> %a0, %a1 - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { -; SSE-LABEL: trunc_mul_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: pmullw %xmm6, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_mul_v8i64_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_v8i64_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 -; 
AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = mul <8 x i64> %a0, %a1 - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { -; SSE-LABEL: trunc_mul_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_mul_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_mul_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = mul <8 x i32> %a0, %a1 - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { -; SSE-LABEL: trunc_mul_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: 
vpmuludq %xmm4, %xmm0, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255] -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5 -; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3 -; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2] -; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2] -; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2] -; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3 -; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3 -; 
AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1 -; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0 -; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_mul_v16i64_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovqd %zmm3, %ymm3 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1 -; AVX512F-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_v16i64_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = mul <16 x i64> %a0, %a1 - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { -; SSE-LABEL: trunc_mul_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: pmuludq %xmm8, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] -; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, 
%xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_mul_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_mul_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = mul <16 x i32> %a0, %a1 - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { -; SSE-LABEL: trunc_mul_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pmullw %xmm2, %xmm0 -; SSE-NEXT: pmullw %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_mul_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0 
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_mul_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = mul <16 x i16> %a0, %a1 - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) { -; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE-NEXT: pslld $16, %xmm2 -; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: packssdw %xmm2, %xmm1 -; SSE-NEXT: pmullw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> - %2 = zext <8 x i8> %1 to <8 x 
i32> - %3 = mul <8 x i32> %2, %a1 - %4 = trunc <8 x i32> %3 to <8 x i16> - ret <8 x i16> %4 -} - -; -; mul to constant -; - -define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { -; SSE-LABEL: trunc_mul_const_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_mul_const_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = mul <4 x i64> %a0, - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { -; SSE-LABEL: trunc_mul_const_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 
-; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_mul_const_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = mul <8 x i64> %a0, - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind { -; SSE-LABEL: trunc_mul_const_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_const_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_mul_const_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_mul_const_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = mul <8 x i32> %a0, - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind { -; SSE-LABEL: trunc_mul_const_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6 -; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, 
%xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 -; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7 -; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3 -; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, 
%xmm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_mul_const_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = mul <16 x i64> %a0, - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind { -; SSE-LABEL: trunc_mul_const_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3] -; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_const_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255] -; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; 
AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_mul_const_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_mul_const_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = mul <16 x i32> %a0, - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind { -; SSE-LABEL: trunc_mul_const_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0 -; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_mul_const_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_mul_const_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = mul <16 x i16> %a0, - %2 = trunc <16 x i16> %1 to <16 x i8> - 
ret <16 x i8> %2 -} - -; -; and -; - -define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; SSE-LABEL: trunc_and_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: andps %xmm3, %xmm1 -; SSE-NEXT: andps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_and_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <4 x i64> %a0, %a1 - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { -; SSE-LABEL: trunc_and_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_and_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <8 x i64> %a0, %a1 - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { -; SSE-LABEL: trunc_and_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_and_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_and_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <8 x i32> %a0, %a1 - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { -; SSE-LABEL: trunc_and_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; 
SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm8, %ymm7, %ymm7 -; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7 -; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm8, %ymm6, %ymm6 -; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm8, %ymm5, %ymm3 -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm8, %ymm4, %ymm3 -; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; 
AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_and_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <16 x i64> %a0, %a1 - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { -; SSE-LABEL: trunc_and_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm3, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: pand %xmm2, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: packuswb %xmm6, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_and_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_and_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <16 x i32> %a0, %a1 - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { -; SSE-LABEL: trunc_and_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper 
-; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_and_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_and_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_and_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_and_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = and <16 x i16> %a0, %a1 - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -; -; and to constant -; - -define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind { -; SSE-LABEL: trunc_and_const_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: andps {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_const_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_and_const_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <4 x i64> %a0, - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { -; SSE-LABEL: trunc_and_const_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: andpd {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_const_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_and_const_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <8 x i64> %a0, - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind { -; SSE-LABEL: trunc_and_const_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_const_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_and_const_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_and_const_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: # 
kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <8 x i32> %a0, - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { -; SSE-LABEL: trunc_and_const_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_const_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_and_const_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <16 x i64> %a0, - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind { -; SSE-LABEL: trunc_and_const_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_and_const_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_and_const_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_and_const_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = and <16 x i32> %a0, - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind { -; SSE-LABEL: trunc_and_const_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq 
-; -; AVX1-LABEL: trunc_and_const_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_and_const_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_and_const_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = and <16 x i16> %a0, - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -; -; xor -; - -define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; SSE-LABEL: trunc_xor_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: xorps %xmm3, %xmm1 -; SSE-NEXT: xorps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_xor_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <4 x i64> %a0, %a1 - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { -; SSE-LABEL: trunc_xor_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm6, 
%xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_xor_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <8 x i64> %a0, %a1 - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { -; SSE-LABEL: trunc_xor_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} 
xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_xor_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_xor_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <8 x i32> %a0, %a1 - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { -; SSE-LABEL: trunc_xor_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2 -; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_xor_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <16 x i64> %a0, %a1 - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { -; SSE-LABEL: trunc_xor_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm5, %xmm1 -; SSE-NEXT: pxor %xmm6, %xmm2 -; SSE-NEXT: pxor %xmm7, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; 
AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_xor_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_xor_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <16 x i32> %a0, %a1 - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { -; SSE-LABEL: trunc_xor_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm2, %xmm0 -; SSE-NEXT: pxor %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_xor_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_xor_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_xor_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = xor <16 x i16> %a0, %a1 - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -; -; xor to constant -; - -define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind { -; SSE-LABEL: trunc_xor_const_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm1[0,2] -; SSE-NEXT: xorps {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_const_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_xor_const_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <4 x i64> %a0, - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { -; SSE-LABEL: trunc_xor_const_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_const_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; 
AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_xor_const_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <8 x i64> %a0, - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind { -; SSE-LABEL: trunc_xor_const_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_const_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_xor_const_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_xor_const_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <8 x i32> %a0, - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { -; SSE-LABEL: trunc_xor_const_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_const_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb 
%xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_xor_const_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <16 x i64> %a0, - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind { -; SSE-LABEL: trunc_xor_const_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_const_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_xor_const_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_xor_const_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = xor <16 x i32> %a0, - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind { -; SSE-LABEL: trunc_xor_const_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: pxor {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_xor_const_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_xor_const_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; 
AVX512DQ-NEXT: retq - %1 = xor <16 x i16> %a0, - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -; -; or -; - -define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { -; SSE-LABEL: trunc_or_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: orps %xmm3, %xmm1 -; SSE-NEXT: orps %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_or_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <4 x i64> %a0, %a1 - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { -; SSE-LABEL: trunc_or_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_or_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <8 x i64> %a0, %a1 - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { -; SSE-LABEL: trunc_or_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_or_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_or_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <8 x i32> %a0, %a1 - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { -; SSE-LABEL: trunc_or_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0 -; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1 -; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2 -; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3 -; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4 -; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5 -; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6 -; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; 
SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1 -; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, 
%ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_or_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1 -; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <16 x i64> %a0, %a1 - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind { -; SSE-LABEL: trunc_or_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm4, %xmm0 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_or_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_or_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <16 x i32> %a0, %a1 - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind { -; SSE-LABEL: trunc_or_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_or_v16i16_v16i8: -; AVX2: # 
%bb.0: -; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_or_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_or_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_or_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = or <16 x i16> %a0, %a1 - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -; -; or to constant -; - -define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind { -; SSE-LABEL: trunc_or_const_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: orps {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_const_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_or_const_v4i64_v4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <4 x i64> %a0, - %2 = trunc <4 x i64> %1 to <4 x i32> - ret <4 x i32> %2 -} - -define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { -; SSE-LABEL: trunc_or_const_v8i64_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE-NEXT: orpd {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_const_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_or_const_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <8 x i64> %a0, - %2 = trunc <8 x i64> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind { -; SSE-LABEL: trunc_or_const_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: pslld $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: pslld $16, %xmm0 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_const_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_or_const_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_or_const_v8i32_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 
-; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <8 x i32> %a0, - %2 = trunc <8 x i32> %1 to <8 x i16> - ret <8 x i16> %2 -} - -define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { -; SSE-LABEL: trunc_or_const_v16i64_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE-NEXT: pand %xmm8, %xmm7 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: pand %xmm8, %xmm5 -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: packuswb %xmm5, %xmm4 -; SSE-NEXT: packuswb %xmm6, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm3 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_const_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 -; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 -; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = 
ymm2[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_or_const_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <16 x i64> %a0, - %2 = trunc <16 x i64> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind { -; SSE-LABEL: trunc_or_const_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_const_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_or_const_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_or_const_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = or <16 x i32> %a0, - %2 = trunc <16 x i32> %1 to <16 x i8> - ret <16 x i8> %2 -} - -define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind { -; SSE-LABEL: trunc_or_const_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_or_const_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, 
%xmm0 -; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_or_const_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_or_const_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8: -; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 -; AVX512DQ-NEXT: vzeroupper -; AVX512DQ-NEXT: retq - %1 = or <16 x i16> %a0, - %2 = trunc <16 x i16> %1 to <16 x i8> - ret <16 x i8> %2 -} - -; -; complex patterns - often created by vectorizer -; - -define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { -; SSE-LABEL: mul_add_const_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: mul_add_const_v4i64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: retq - %1 = sext <4 x i32> %a0 to <4 x i64> - %2 = sext <4 x i32> %a1 to <4 x i64> - %3 = mul <4 x i64> %1, %2 - %4 = add <4 x i64> %3, - %5 = trunc <4 x i64> %4 to <4 x i32> - ret <4 x i32> %5 -} - -define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { -; SSE-LABEL: mul_add_self_v4i64_v4i32: -; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: paddd %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: mul_add_self_v4i64_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq - %1 = sext <4 x i32> %a0 to <4 x i64> - %2 = sext <4 x i32> %a1 to <4 x i64> - %3 = mul <4 x i64> %1, %2 - %4 = add <4 x i64> %3, %3 - %5 = trunc <4 x i64> %4 to <4 x i32> - ret <4 x i32> %5 -} - -define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x 
i32> %a0, <4 x i32> %a1) nounwind {
-; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
-; SSE-NEXT: paddd %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %1 = sext <4 x i32> %a0 to <4 x i64>
- %2 = sext <4 x i32> %a1 to <4 x i64>
- %3 = mul <4 x i64> %1, %2
- %4 = add <4 x i64> %1, %3
- %5 = trunc <4 x i64> %4 to <4 x i32>
- ret <4 x i32> %5
-}
diff --git a/test/CodeGen/X86/vector-trunc-packus-widen.ll b/test/CodeGen/X86/vector-trunc-packus-widen.ll
deleted file mode 100644
index eb0a32fee08..00000000000
--- a/test/CodeGen/X86/vector-trunc-packus-widen.ll
+++ /dev/null
@@ -1,3079 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-
-;
-; PACKUS saturation truncation to vXi32
-;
-
-define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
-; SSE2-LABEL: trunc_packus_v4i64_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-;
SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v4i64_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm2 -; 
SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v4i64_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967295,4294967295] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; 
AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_packus_v4i64_v4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v4i64_v4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v4i64_v4i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <4 x i64> %a0, - %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> - %3 = icmp sgt <4 x i64> %2, zeroinitializer - %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer - %5 = trunc <4 x i64> %4 to <4 x i32> - ret <4 x i32> %5 -} - - -define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) { -; SSE2-LABEL: trunc_packus_v8i64_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, 
%xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v8i64_v8i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647] -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: 
pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm1 -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v8i64_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647] -; SSE41-NEXT: movdqa %xmm5, %xmm4 -; 
SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm8 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm5, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm5, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm5, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: xorpd %xmm2, %xmm2 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm10, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 -; SSE41-NEXT: movapd %xmm4, %xmm1 -; SSE41-NEXT: xorpd %xmm10, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; SSE41-NEXT: movapd %xmm9, %xmm3 -; SSE41-NEXT: xorpd %xmm10, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm8, %xmm4 -; SSE41-NEXT: xorpd %xmm10, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v8i64_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; 
AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_packus_v8i64_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512-NEXT: retq - %1 = icmp slt <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = icmp sgt <8 x i64> %2, zeroinitializer - %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer - %5 = trunc <8 x i64> %4 to <8 x i32> - ret <8 x i32> %5 -} - -; -; PACKUS saturation truncation to vXi16 -; - -define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) { -; SSE2-LABEL: trunc_packus_v8i64_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: 
pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; 
SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v8i64_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm3, %xmm6 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm6, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, 
%xmm3 -; SSSE3-NEXT: pxor %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v8i64_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm10, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 -; 
SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 -; SSE41-NEXT: movapd %xmm6, %xmm1 -; SSE41-NEXT: xorpd %xmm10, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: packusdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: xorpd %xmm10, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm8, %xmm2 -; SSE41-NEXT: xorpd %xmm10, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 -; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_packus_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: 
retq - %1 = icmp slt <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = icmp sgt <8 x i64> %2, zeroinitializer - %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer - %5 = trunc <8 x i64> %4 to <8 x i16> - ret <8 x i16> %5 -} - -define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) { -; SSE2-LABEL: trunc_packus_v8i32_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v8i32_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm1 -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v8i32_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_packus_v8i32_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v8i32_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v8i32_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; 
AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = icmp sgt <8 x i32> %2, zeroinitializer - %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer - %5 = trunc <8 x i32> %4 to <8 x i16> - ret <8 x i16> %5 -} - -define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32> %a0) { -; SSE2-LABEL: trunc_packus_v16i32_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] -; SSE2-NEXT: movdqa %xmm6, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm6, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm6, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm6, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm6, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm6, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm5, %xmm0 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm3, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v16i32_v16i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535] -; SSSE3-NEXT: movdqa %xmm6, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm6, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm6, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm6, %xmm5 -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: movdqa %xmm6, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm6, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm6, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm6, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; 
SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5 -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pslld $16, %xmm5 -; SSSE3-NEXT: psrad $16, %xmm5 -; SSSE3-NEXT: pslld $16, %xmm0 -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: packssdw %xmm5, %xmm0 -; SSSE3-NEXT: pslld $16, %xmm3 -; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: pslld $16, %xmm1 -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: packssdw %xmm3, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v16i32_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v16i32_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v16i32_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_packus_v16i32_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 -; AVX512-NEXT: retq - %1 = icmp slt <16 x i32> %a0, - %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> - %3 = icmp sgt <16 x i32> %2, zeroinitializer - %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer - %5 = trunc <16 x i32> %4 to <16 x i16> - ret <16 x i16> %5 -} - -; -; PACKUS saturation truncation to v16i8 -; - -define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) { -; SSE2-LABEL: trunc_packus_v8i64_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: 
pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v8i64_v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; 
SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v8i64_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd 
%xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm10, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 -; SSE41-NEXT: movapd %xmm6, %xmm1 -; SSE41-NEXT: xorpd %xmm10, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: packusdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: xorpd %xmm10, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: movapd %xmm8, %xmm2 -; SSE41-NEXT: xorpd %xmm10, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: packusdw %xmm4, %xmm3 -; SSE41-NEXT: packusdw %xmm3, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v8i64_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; 
AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v8i64_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_packus_v8i64_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = icmp sgt <8 x i64> %2, zeroinitializer - %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer - %5 = trunc <8 x i64> %4 to <8 x i8> - ret <8 x i8> %5 -} - -define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { -; SSE2-LABEL: trunc_packus_v8i64_v8i8_store: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm10, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm10, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm10, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; 
SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm10, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm10, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm10, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: 
pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm10, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm10, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm10, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v8i64_v8i8_store: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; 
SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm9, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: movapd %xmm7, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm10, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7 -; SSE41-NEXT: xorpd %xmm1, %xmm1 -; SSE41-NEXT: movapd %xmm7, %xmm4 -; SSE41-NEXT: xorpd %xmm10, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm5 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: xorpd %xmm10, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: xorpd %xmm10, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: movapd %xmm8, %xmm2 -; SSE41-NEXT: xorpd %xmm10, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm10, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm10, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packusdw %xmm3, %xmm1 -; SSE41-NEXT: packusdw %xmm1, %xmm4 -; SSE41-NEXT: packuswb %xmm0, %xmm4 -; SSE41-NEXT: movq %xmm4, (%rdi) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v8i64_v8i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq 
%xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2 -; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v8i64_v8i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_packus_v8i64_v8i8_store: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovusqb %zmm0, (%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = icmp sgt <8 x i64> %2, zeroinitializer - %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer - %5 = trunc <8 x i64> %4 to <8 x i8> - store <8 x i8> %5, <8 x i8> *%p1 - ret void -} - -define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) { -; SSE2-LABEL: trunc_packus_v16i64_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm14, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pandn %xmm10, %xmm9 -; SSE2-NEXT: por %xmm6, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm12 
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm7 -; SSE2-NEXT: pandn %xmm10, %xmm12 -; SSE2-NEXT: por %xmm7, %xmm12 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm13 -; SSE2-NEXT: pand %xmm13, %xmm4 -; SSE2-NEXT: pandn %xmm10, %xmm13 -; SSE2-NEXT: por %xmm4, %xmm13 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm14 -; SSE2-NEXT: pand %xmm14, %xmm5 -; SSE2-NEXT: pandn %xmm10, %xmm14 -; SSE2-NEXT: por %xmm5, %xmm14 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm8, %xmm3 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm10, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm10, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm14, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm14, %xmm2 -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm12, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm12, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm8, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm9, %xmm4 -; SSE2-NEXT: packuswb %xmm3, %xmm4 -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_packus_v16i64_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = 
[2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] -; SSSE3-NEXT: por %xmm14, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pandn %xmm10, %xmm9 -; SSSE3-NEXT: por %xmm6, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm12 -; SSSE3-NEXT: pand %xmm12, %xmm7 -; SSSE3-NEXT: pandn %xmm10, %xmm12 -; SSSE3-NEXT: por %xmm7, %xmm12 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm11, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm13 -; SSSE3-NEXT: pand %xmm13, %xmm4 -; SSSE3-NEXT: pandn %xmm10, %xmm13 -; SSSE3-NEXT: por %xmm4, %xmm13 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm11, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm14 -; SSSE3-NEXT: pand %xmm14, %xmm5 -; SSSE3-NEXT: pandn %xmm10, %xmm14 -; SSSE3-NEXT: por %xmm5, %xmm14 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm11, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm10, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm10, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm8, %xmm3 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: 
pandn %xmm10, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm8, %xmm0 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm10, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm8, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm8, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm14, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm14, %xmm2 -; SSSE3-NEXT: movdqa %xmm13, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm13, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm12, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = 
xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm12, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pand %xmm9, %xmm4 -; SSSE3-NEXT: packuswb %xmm3, %xmm4 -; SSSE3-NEXT: packuswb %xmm4, %xmm1 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_packus_v16i64_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483903,2147483903] -; SSE41-NEXT: movdqa %xmm12, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: movdqa %xmm12, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm14 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm15 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: 
movdqa %xmm12, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm12, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm12, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movapd %xmm11, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm7 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; SSE41-NEXT: packusdw %xmm7, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm3 -; SSE41-NEXT: xorpd %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm9, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: packusdw %xmm3, %xmm4 -; SSE41-NEXT: packusdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm15, %xmm3 -; SSE41-NEXT: xorpd %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4 -; SSE41-NEXT: movapd %xmm14, %xmm3 -; SSE41-NEXT: xorpd %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3 -; SSE41-NEXT: packusdw %xmm4, %xmm3 -; SSE41-NEXT: movapd %xmm13, %xmm4 -; SSE41-NEXT: xorpd %xmm9, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm4 -; SSE41-NEXT: movapd %xmm10, %xmm5 -; SSE41-NEXT: xorpd %xmm9, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; 
SSE41-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm9, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 -; SSE41-NEXT: packusdw %xmm4, %xmm2 -; SSE41-NEXT: packusdw %xmm2, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm5, %xmm10 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm9, %xmm5, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm5, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm5, %xmm3 -; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4 -; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm4 -; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm3 -; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm4 -; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm2 -; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm3 -; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm11, %xmm2 -; AVX1-NEXT: vpand %xmm11, %xmm2, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm3 -; AVX1-NEXT: vpand %xmm10, %xmm3, %xmm3 -; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 -; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = 
ymm0[0,2,1,3] -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_packus_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <16 x i64> %a0, - %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> - %3 = icmp sgt <16 x i64> %2, zeroinitializer - %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer - %5 = trunc <16 x i64> %4 to <16 x i8> - ret <16 x i8> %5 -} - -define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) { -; SSE-LABEL: trunc_packus_v8i32_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v8i32_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v8i32_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_packus_v8i32_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v8i32_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v8i32_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = icmp sgt <8 x i32> %2, zeroinitializer - %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer - %5 = trunc <8 x i32> %4 to <8 x i8> - ret <8 x i8> %5 -} - -define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { -; SSE-LABEL: trunc_packus_v8i32_v8i8_store: -; SSE: # %bb.0: -; SSE-NEXT: 
packssdw %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm0, (%rdi) -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v8i32_v8i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v8i32_v8i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = icmp sgt <8 x i32> %2, zeroinitializer - %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer - %5 = trunc <8 x i32> %4 to <8 x i8> - store <8 x i8> %5, <8 x i8> *%p1 - ret void -} - -define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32> %a0) { -; SSE-LABEL: trunc_packus_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_packus_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <16 x i32> %a0, - %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> - %3 = icmp sgt <16 x i32> %2, zeroinitializer - %4 = 
select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer - %5 = trunc <16 x i32> %4 to <16 x i8> - ret <16 x i8> %5 -} - -define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) { -; SSE-LABEL: trunc_packus_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_packus_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v16i16_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <16 x i16> %a0, - %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> - %3 = icmp sgt <16 x i16> %2, zeroinitializer - %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer - %5 = trunc <16 x i16> %4 to <16 x i8> - ret <16 x i8> %5 -} - -define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) { -; SSE-LABEL: trunc_packus_v32i16_v32i8: -; SSE: # %bb.0: -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_packus_v32i16_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_packus_v32i16_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_packus_v32i16_v32i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_packus_v32i16_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_packus_v32i16_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 -; AVX512BWVL-NEXT: retq - %1 = icmp slt <32 x i16> %a0, - %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> - %3 = icmp sgt <32 x i16> %2, zeroinitializer - %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer - %5 = trunc <32 x i16> %4 to <32 x i8> - ret <32 x i8> %5 -} diff --git a/test/CodeGen/X86/vector-trunc-ssat-widen.ll b/test/CodeGen/X86/vector-trunc-ssat-widen.ll deleted file mode 100644 index 4d32267b61e..00000000000 --- a/test/CodeGen/X86/vector-trunc-ssat-widen.ll +++ /dev/null @@ -1,3050 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE 
--check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL - -; -; Signed saturation truncation to vXi32 -; - -define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { -; SSE2-LABEL: trunc_ssat_v4i64_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pandn %xmm1, %xmm6 -; SSE2-NEXT: por %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm5, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: 
por %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_ssat_v4i64_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320] -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pandn %xmm1, %xmm6 -; SSSE3-NEXT: por %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm1, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_ssat_v4i64_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647] -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295] -; SSE41-NEXT: movdqa %xmm6, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm3, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: 
por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968] -; SSE41-NEXT: movapd %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm3, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320] -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: xorpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm6, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647] -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5 -; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm2 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_ssat_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_ssat_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 -; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2 -; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_ssat_v4i64_v4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512F-NEXT: 
vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v4i64_v4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <4 x i64> %a0, - %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> - %3 = icmp sgt <4 x i64> %2, - %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> - %5 = trunc <4 x i64> %4 to <4 x i32> - ret <4 x i32> %5 -} - - -define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) { -; SSE2-LABEL: trunc_ssat_v8i64_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm6 -; SSE2-NEXT: por 
%xmm3, %xmm6 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_ssat_v8i64_v8i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; 
SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm6 -; SSSE3-NEXT: por %xmm3, %xmm6 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968] -; SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm7, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm7, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_ssat_v8i64_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm9 -; SSE41-NEXT: movapd {{.*#+}} xmm10 = [2147483647,2147483647] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295] -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm6 
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm10, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm8 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm10, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm10, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968] -; SSE41-NEXT: movapd %xmm10, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320] -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm4 -; SSE41-NEXT: movapd %xmm6, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; SSE41-NEXT: movapd %xmm9, %xmm4 -; SSE41-NEXT: xorpd %xmm5, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4 -; SSE41-NEXT: xorpd %xmm8, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2 -; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; SSE41-NEXT: movaps %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v8i64_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483647,2147483647] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, 
%xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_ssat_v8i64_v8i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_ssat_v8i64_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsqd %zmm0, %ymm0 -; AVX512-NEXT: retq - %1 = icmp slt <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = icmp sgt <8 x i64> %2, - %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> - %5 = trunc <8 x i64> %4 to <8 x i32> - ret <8 x i32> %5 -} - -; -; Signed saturation truncation to vXi16 -; - -define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) { -; SSE2-LABEL: trunc_ssat_v8i64_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415] -; SSE2-NEXT: 
movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa 
%xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: packssdw %xmm3, %xmm1 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_ssat_v8i64_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm1, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848] -; SSSE3-NEXT: movdqa %xmm7, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm7, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = 
xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: packssdw %xmm3, %xmm1 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_ssat_v8i64_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm10 -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [32767,32767] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415] -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE41-NEXT: movdqa %xmm10, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 -; SSE41-NEXT: movapd {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848] -; SSE41-NEXT: movapd %xmm11, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200] -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; 
SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm6 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: packssdw %xmm6, %xmm1 -; SSE41-NEXT: movapd %xmm9, %xmm2 -; SSE41-NEXT: xorpd %xmm5, %xmm2 -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm3, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2 -; SSE41-NEXT: xorpd %xmm8, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3 -; SSE41-NEXT: packssdw %xmm2, %xmm3 -; SSE41-NEXT: packssdw %xmm3, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 -; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [32767,32767,32767,32767] -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: 
trunc_ssat_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = icmp sgt <8 x i64> %2, - %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> - %5 = trunc <8 x i64> %4 to <8 x i16> - ret <8 x i16> %5 -} - -define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) { -; SSE-LABEL: trunc_ssat_v8i32_v8i16: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_ssat_v8i32_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v8i32_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v8i32_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528] -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = icmp sgt <8 x i32> %2, - %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> - %5 = trunc <8 x i32> %4 to <8 x i16> - ret <8 x i16> %5 -} - -define <16 x i16> @trunc_ssat_v16i32_v16i16(<16 x i32> %a0) { -; SSE-LABEL: trunc_ssat_v16i32_v16i16: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v16i32_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v16i32_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_ssat_v16i32_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsdw %zmm0, %ymm0 -; AVX512-NEXT: retq - %1 = icmp slt <16 x i32> %a0, - %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> - %3 = icmp sgt <16 x i32> %2, - %4 = select <16 x i1> %3, <16 x i32> 
%2, <16 x i32> - %5 = trunc <16 x i32> %4 to <16 x i16> - ret <16 x i16> %5 -} - -; -; Signed saturation truncation to v16i8 -; - -define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) { -; SSE2-LABEL: trunc_ssat_v8i64_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: 
movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm7, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_ssat_v8i64_v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm0, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm7, %xmm0 -; 
SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: packuswb %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: packuswb %xmm7, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_ssat_v8i64_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm11, %xmm1 -; SSE41-NEXT: xorpd %xmm5, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm1, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: xorpd %xmm5, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm10, %xmm3 -; SSE41-NEXT: xorpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 -; SSE41-NEXT: xorpd %xmm9, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2 -; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE41-NEXT: andpd %xmm0, %xmm2 -; SSE41-NEXT: andpd %xmm0, %xmm3 -; SSE41-NEXT: packusdw %xmm2, %xmm3 -; SSE41-NEXT: andpd %xmm0, %xmm7 -; SSE41-NEXT: andpd %xmm0, %xmm1 -; SSE41-NEXT: packusdw %xmm7, %xmm1 -; SSE41-NEXT: packusdw %xmm3, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v8i64_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11 -; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8 -; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq 
%xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] -; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v8i64_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_ssat_v8i64_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = icmp sgt <8 x i64> %2, - %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> - %5 = trunc <8 x i64> %4 to <8 x i8> - ret <8 x i8> %5 -} - -; TODO: The AVX1 codegen shows a missed opportunity to narrow blendv+logic to 128-bit. 
- -define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { -; SSE2-LABEL: trunc_ssat_v8i64_v8i8_store: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm0, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm2, %xmm7 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] 
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: packuswb %xmm7, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm4, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm0, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm7, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 
= [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm2, %xmm7 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: packuswb %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm3, %xmm7 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: packuswb %xmm7, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSSE3-NEXT: movq %xmm0, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm9 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm2 -; SSE41-NEXT: pcmpeqd 
%xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm6, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11 -; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm11, %xmm2 -; SSE41-NEXT: xorpd %xmm5, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm2, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: xorpd %xmm5, %xmm6 -; SSE41-NEXT: movapd %xmm6, %xmm7 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm7, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; SSE41-NEXT: movapd %xmm10, %xmm3 -; SSE41-NEXT: xorpd %xmm5, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3 -; SSE41-NEXT: xorpd %xmm9, %xmm5 -; SSE41-NEXT: movapd %xmm5, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm5, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1 -; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE41-NEXT: andpd %xmm0, %xmm1 -; SSE41-NEXT: andpd %xmm0, %xmm3 -; SSE41-NEXT: packusdw %xmm1, %xmm3 -; SSE41-NEXT: andpd %xmm0, %xmm7 -; SSE41-NEXT: andpd %xmm0, %xmm2 -; SSE41-NEXT: packusdw %xmm7, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packuswb %xmm0, %xmm2 -; SSE41-NEXT: movq %xmm2, (%rdi) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v8i64_v8i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7 -; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11 -; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8 -; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 
-; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0 -; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 -; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1 -; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255] -; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v8i64_v8i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_ssat_v8i64_v8i8_store: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsqb %zmm0, (%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = icmp sgt <8 x i64> %2, - %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> - %5 = trunc <8 x i64> %4 to <8 x i8> - store <8 x i8> %5, <8 x i8> *%p1 - ret void -} - -define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) { -; SSE2-LABEL: trunc_ssat_v16i64_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [127,127] -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm6, %xmm9 -; SSE2-NEXT: pxor %xmm8, %xmm9 -; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm14, %xmm9 -; SSE2-NEXT: pand %xmm9, %xmm6 -; SSE2-NEXT: pandn %xmm10, %xmm9 -; SSE2-NEXT: por %xmm6, %xmm9 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; 
SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm7 -; SSE2-NEXT: pandn %xmm10, %xmm12 -; SSE2-NEXT: por %xmm7, %xmm12 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: movdqa %xmm11, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm13 -; SSE2-NEXT: pand %xmm13, %xmm4 -; SSE2-NEXT: pandn %xmm10, %xmm13 -; SSE2-NEXT: por %xmm4, %xmm13 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm11, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm14 -; SSE2-NEXT: pand %xmm14, %xmm5 -; SSE2-NEXT: pandn %xmm10, %xmm14 -; SSE2-NEXT: por %xmm5, %xmm14 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm11, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm10, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm6 -; SSE2-NEXT: por %xmm3, %xmm6 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm8, %xmm2 -; SSE2-NEXT: movdqa %xmm11, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm0 -; SSE2-NEXT: pandn %xmm10, %xmm3 -; SSE2-NEXT: por %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm11, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm10, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa 
{{.*#+}} xmm11 = [18446744071562067840,18446744071562067840] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm10, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm3 -; SSE2-NEXT: pandn %xmm10, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm10, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pandn %xmm10, %xmm3 -; SSE2-NEXT: por %xmm5, %xmm3 -; SSE2-NEXT: packssdw %xmm2, %xmm3 -; SSE2-NEXT: packssdw %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm14, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm14 -; SSE2-NEXT: pandn %xmm10, %xmm2 -; SSE2-NEXT: por %xmm14, %xmm2 -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm13 -; SSE2-NEXT: pandn %xmm10, %xmm3 -; SSE2-NEXT: por %xmm13, %xmm3 -; SSE2-NEXT: packssdw %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm12 -; SSE2-NEXT: pandn %xmm10, %xmm2 -; SSE2-NEXT: por %xmm12, %xmm2 -; 
SSE2-NEXT: pxor %xmm9, %xmm8 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm11, %xmm8 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm9 -; SSE2-NEXT: pandn %xmm10, %xmm1 -; SSE2-NEXT: por %xmm9, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 -; SSE2-NEXT: packssdw %xmm1, %xmm3 -; SSE2-NEXT: packsswb %xmm3, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_ssat_v16i64_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [127,127] -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm6, %xmm9 -; SSSE3-NEXT: pxor %xmm8, %xmm9 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775] -; SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3] -; SSSE3-NEXT: por %xmm14, %xmm9 -; SSSE3-NEXT: pand %xmm9, %xmm6 -; SSSE3-NEXT: pandn %xmm10, %xmm9 -; SSSE3-NEXT: por %xmm6, %xmm9 -; SSSE3-NEXT: movdqa %xmm7, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm11, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm12 -; SSSE3-NEXT: pand %xmm12, %xmm7 -; SSSE3-NEXT: pandn %xmm10, %xmm12 -; SSSE3-NEXT: por %xmm7, %xmm12 -; SSSE3-NEXT: movdqa %xmm4, %xmm6 -; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: movdqa %xmm11, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm13 -; SSSE3-NEXT: pand %xmm13, %xmm4 -; SSSE3-NEXT: pandn %xmm10, %xmm13 -; SSSE3-NEXT: por %xmm4, %xmm13 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm11, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm14 -; SSSE3-NEXT: pand %xmm14, %xmm5 -; SSSE3-NEXT: pandn %xmm10, %xmm14 -; SSSE3-NEXT: por %xmm5, %xmm14 -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: pxor %xmm8, %xmm4 -; SSSE3-NEXT: movdqa %xmm11, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm10, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm11, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd 
%xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pandn %xmm10, %xmm6 -; SSSE3-NEXT: por %xmm3, %xmm6 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pxor %xmm8, %xmm2 -; SSSE3-NEXT: movdqa %xmm11, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm0 -; SSSE3-NEXT: pandn %xmm10, %xmm3 -; SSSE3-NEXT: por %xmm0, %xmm3 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm8, %xmm0 -; SSSE3-NEXT: movdqa %xmm11, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm10, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm8, %xmm0 -; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm10, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm8, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm3 -; SSSE3-NEXT: pandn %xmm10, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm6, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm10, %xmm2 -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pandn %xmm10, %xmm3 -; SSSE3-NEXT: por %xmm5, %xmm3 -; SSSE3-NEXT: packssdw %xmm2, %xmm3 -; SSSE3-NEXT: packssdw %xmm3, %xmm0 -; 
SSSE3-NEXT: movdqa %xmm14, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm14 -; SSSE3-NEXT: pandn %xmm10, %xmm2 -; SSSE3-NEXT: por %xmm14, %xmm2 -; SSSE3-NEXT: movdqa %xmm13, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm13 -; SSSE3-NEXT: pandn %xmm10, %xmm3 -; SSSE3-NEXT: por %xmm13, %xmm3 -; SSSE3-NEXT: packssdw %xmm2, %xmm3 -; SSSE3-NEXT: movdqa %xmm12, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm12 -; SSSE3-NEXT: pandn %xmm10, %xmm2 -; SSSE3-NEXT: por %xmm12, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm8 -; SSSE3-NEXT: movdqa %xmm8, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm11, %xmm8 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm9 -; SSSE3-NEXT: pandn %xmm10, %xmm1 -; SSSE3-NEXT: por %xmm9, %xmm1 -; SSSE3-NEXT: packssdw %xmm2, %xmm1 -; SSSE3-NEXT: packssdw %xmm1, %xmm3 -; SSSE3-NEXT: packsswb %xmm3, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_ssat_v16i64_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127] -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648] -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483775,2147483775] -; SSE41-NEXT: movdqa %xmm12, %xmm10 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm10 -; SSE41-NEXT: movdqa %xmm12, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm10, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm10 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: movdqa %xmm12, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por 
%xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm14 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm15 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm12, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm12, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm11, %xmm7 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm9, %xmm0 -; SSE41-NEXT: movdqa %xmm12, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm12 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm12, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11 -; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488] -; SSE41-NEXT: movapd %xmm11, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840] -; SSE41-NEXT: movapd %xmm1, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4 -; SSE41-NEXT: movapd %xmm7, %xmm1 -; SSE41-NEXT: xorpd %xmm9, %xmm1 -; SSE41-NEXT: movapd %xmm1, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: packssdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm6, %xmm3 -; SSE41-NEXT: xorpd %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 -; SSE41-NEXT: movapd %xmm5, %xmm4 -; SSE41-NEXT: xorpd %xmm9, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm6 -; SSE41-NEXT: 
pcmpgtd %xmm8, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4 -; SSE41-NEXT: packssdw %xmm3, %xmm4 -; SSE41-NEXT: packssdw %xmm4, %xmm1 -; SSE41-NEXT: movapd %xmm15, %xmm3 -; SSE41-NEXT: xorpd %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm4 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm4 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm3 -; SSE41-NEXT: movapd %xmm14, %xmm4 -; SSE41-NEXT: xorpd %xmm9, %xmm4 -; SSE41-NEXT: movapd %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm4 -; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm4 -; SSE41-NEXT: packssdw %xmm3, %xmm4 -; SSE41-NEXT: movapd %xmm13, %xmm3 -; SSE41-NEXT: xorpd %xmm9, %xmm3 -; SSE41-NEXT: movapd %xmm3, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm2, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm3 -; SSE41-NEXT: xorpd %xmm10, %xmm9 -; SSE41-NEXT: movapd %xmm9, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm8, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm8, %xmm9 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm9, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2 -; SSE41-NEXT: packssdw %xmm3, %xmm2 -; SSE41-NEXT: packssdw %xmm2, %xmm4 -; SSE41-NEXT: packsswb %xmm4, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm5, %xmm10 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm11 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm6 -; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm9, %xmm5, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm0 -; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm5, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm5, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488] -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm4 -; AVX1-NEXT: vblendvpd %xmm4, %xmm7, %xmm5, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm7 
-; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm11, %xmm7 -; AVX1-NEXT: vblendvpd %xmm7, %xmm11, %xmm5, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm3 -; AVX1-NEXT: vblendvpd %xmm3, %xmm10, %xmm5, %xmm3 -; AVX1-NEXT: vpackssdw %xmm8, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm7, %xmm3, %xmm2 -; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127] -; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1 -; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_ssat_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127] -; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <16 x i64> %a0, - %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> - %3 = icmp sgt <16 x i64> %2, - %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> - %5 = trunc <16 x i64> %4 to <16 x i8> - ret <16 x i8> %5 -} - -define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) { -; SSE-LABEL: trunc_ssat_v8i32_v8i8: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v8i32_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v8i32_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: 
vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_ssat_v8i32_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = icmp sgt <8 x i32> %2, - %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> - %5 = trunc <8 x i32> %4 to <8 x i8> - ret <8 x i8> %5 -} - -define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { -; SSE-LABEL: trunc_ssat_v8i32_v8i8_store: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm0, %xmm0 -; SSE-NEXT: movq %xmm0, (%rdi) -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_ssat_v8i32_v8i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovsdb %ymm0, (%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127] -; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = 
[4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168] -; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = icmp sgt <8 x i32> %2, - %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> - %5 = trunc <8 x i32> %4 to <8 x i8> - store <8 x i8> %5, <8 x i8> *%p1 - ret void -} - -define <16 x i8> @trunc_ssat_v16i32_v16i8(<16 x i32> %a0) { -; SSE-LABEL: trunc_ssat_v16i32_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_ssat_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovsdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp slt <16 x i32> %a0, - %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> - %3 = icmp sgt <16 x i32> %2, - %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> - %5 = trunc <16 x i32> %4 to <16 x i8> - ret <16 x i8> %5 -} - -define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) { -; SSE-LABEL: trunc_ssat_v16i16_v16i8: -; SSE: # %bb.0: -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_ssat_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v16i16_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: 
trunc_ssat_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v16i16_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp slt <16 x i16> %a0, - %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> - %3 = icmp sgt <16 x i16> %2, - %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> - %5 = trunc <16 x i16> %4 to <16 x i8> - ret <16 x i8> %5 -} - -define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16> %a0) { -; SSE-LABEL: trunc_ssat_v32i16_v32i8: -; SSE: # %bb.0: -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: packsswb %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc_ssat_v32i16_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_ssat_v32i16_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_ssat_v32i16_v32i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408] -; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_ssat_v32i16_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_ssat_v32i16_v32i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0 -; AVX512BWVL-NEXT: retq - %1 = icmp slt <32 x i16> %a0, - %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> - %3 = icmp sgt <32 x i16> %2, - %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> - %5 = trunc <32 x i16> %4 to <32 x i8> - ret <32 x i8> %5 -} diff --git a/test/CodeGen/X86/vector-trunc-usat-widen.ll b/test/CodeGen/X86/vector-trunc-usat-widen.ll deleted file mode 100644 index fd76cb53c6e..00000000000 --- a/test/CodeGen/X86/vector-trunc-usat-widen.ll +++ /dev/null @@ -1,2430 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL - -; -; Unsigned saturation truncation to vXi32 -; - -define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) { -; SSE2-LABEL: trunc_usat_v4i64_v4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} 
xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn {{.*}}(%rip), %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v4i64_v4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm2, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] -; SSSE3-NEXT: movdqa %xmm4, %xmm5 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm5 -; SSSE3-NEXT: pxor %xmm1, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm3 -; SSSE3-NEXT: por %xmm1, %xmm3 -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm5 -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v4i64_v4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm3, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm3 -; SSE41-NEXT: por %xmm6, %xmm3 -; SSE41-NEXT: pxor %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295] -; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729] -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2] -; SSE41-NEXT: movaps %xmm4, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v4i64_v4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103] -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vpxor %xmm1, %xmm4, %xmm1 -; 
AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,429496729] -; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm3, %xmm1 -; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,4294967295] -; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] -; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_usat_v4i64_v4i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-NEXT: vpxor %ymm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729] -; AVX2-FAST-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc_usat_v4i64_v4i32: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm0, %k1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vpmovqd %zmm1, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_usat_v4i64_v4i32: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] -; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vpmovqd %ymm1, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_usat_v4i64_v4i32: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512BW-NEXT: vpcmpltuq %zmm1, %zmm0, %k1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729] -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i32: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[4294967295,4294967295,4294967295,429496729] -; AVX512BWVL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1} -; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp ult <4 x i64> %a0, - %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> - %3 = trunc <4 x i64> %2 to <4 x i32> - ret <4 x i32> %3 -} - -define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) { -; SSE2-LABEL: trunc_usat_v8i64_v8i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v8i64_v8i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295] -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm3, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455] -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; 
SSSE3-NEXT: pxor %xmm5, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pxor %xmm0, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] -; SSSE3-NEXT: movaps %xmm3, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v8i64_v8i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295] -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455] -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm6, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm6 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm4, %xmm7 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm7, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm3 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; SSE41-NEXT: pxor %xmm8, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm5, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm5, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9 -; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2] -; SSE41-NEXT: 
movaps %xmm9, %xmm0 -; SSE41-NEXT: movaps %xmm3, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v8i64_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4294967295,4294967295] -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc_usat_v8i64_v8i32: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc_usat_v8i64_v8i32: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm4 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] -; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpxor %ymm3, %ymm0, %ymm3 -; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc_usat_v8i64_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovusqd %zmm0, %ymm0 -; AVX512-NEXT: retq - %1 = icmp ult <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = trunc <8 x i64> %2 to <8 x i32> - ret <8 x i32> %3 -} - -; -; Unsigned saturation truncation to vXi16 -; - -define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) { -; SSE2-LABEL: 
trunc_usat_v8i64_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm6, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm7, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm6, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v8i64_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm2, %xmm5 -; SSSE3-NEXT: pxor %xmm6, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm4, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; 
SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm6, %xmm2 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm3 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm7, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm1, %xmm6 -; SSSE3-NEXT: movdqa %xmm9, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v8i64_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [65535,65535] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packusdw %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} 
xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: packusdw %xmm5, %xmm9 -; SSE41-NEXT: packusdw %xmm9, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v8i64_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [65535,65535] -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v8i64_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_usat_v8i64_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovusqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp ult <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = trunc <8 x i64> %2 to <8 x i16> - ret <8 x i16> %3 -} - -define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) { -; SSE2-LABEL: trunc_usat_v8i32_v8i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm5 -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm4 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; 
SSE2-NEXT: pslld $16, %xmm4 -; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm4, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v8i32_v8i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v8i32_v8i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm2, %xmm1 -; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v8i32_v8i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v8i32_v8i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_usat_v8i32_v8i16: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_usat_v8i32_v8i16: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_usat_v8i32_v8i16: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i16: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp ult <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = trunc <8 x i32> %2 to <8 x i16> - ret <8 x i16> %3 -} - -define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) { -; SSE2-LABEL: trunc_usat_v16i32_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm8 -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa 
{{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm7, %xmm7 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm7, %xmm2 -; SSE2-NEXT: por %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm3 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm7, %xmm4 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm8, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: pand %xmm8, %xmm5 -; SSE2-NEXT: por %xmm7, %xmm5 -; SSE2-NEXT: pslld $16, %xmm5 -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm5, %xmm0 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: packssdw %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v16i32_v16i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm1, %xmm8 -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183] -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1 -; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm1 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm6, %xmm4 -; SSSE3-NEXT: movdqa %xmm5, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm7, %xmm2 -; SSSE3-NEXT: por %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm3 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm7, %xmm4 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm8, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: pand %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm7, %xmm5 -; SSSE3-NEXT: pslld $16, %xmm5 -; SSSE3-NEXT: psrad $16, %xmm5 -; SSSE3-NEXT: pslld $16, %xmm0 -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: packssdw %xmm5, %xmm0 -; SSSE3-NEXT: pslld $16, %xmm2 -; SSSE3-NEXT: psrad $16, %xmm2 -; SSSE3-NEXT: pslld $16, %xmm1 -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: packssdw %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v16i32_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535] -; SSE41-NEXT: pminud %xmm4, %xmm3 -; SSE41-NEXT: pminud %xmm4, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pminud %xmm4, %xmm1 -; SSE41-NEXT: pminud %xmm4, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v16i32_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535] -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: 
vpminud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v16i32_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_usat_v16i32_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovusdw %zmm0, %ymm0 -; AVX512-NEXT: retq - %1 = icmp ult <16 x i32> %a0, - %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> - %3 = trunc <16 x i32> %2 to <16 x i16> - ret <16 x i16> %3 -} - -; -; Unsigned saturation truncation to v16i8 -; - -define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) { -; SSE2-LABEL: trunc_usat_v8i64_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm9, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm5, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm6 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v8i64_v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: pxor %xmm6, %xmm0 -; SSSE3-NEXT: movdqa 
{{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm9, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: pxor %xmm6, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm4, %xmm0 -; SSSE3-NEXT: packuswb %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm6, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm6 -; SSSE3-NEXT: movdqa %xmm9, %xmm1 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm2, %xmm1 -; SSSE3-NEXT: packuswb %xmm4, %xmm1 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v8i64_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packusdw %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 
-; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: packusdw %xmm5, %xmm9 -; SSE41-NEXT: packusdw %xmm9, %xmm1 -; SSE41-NEXT: packuswb %xmm1, %xmm1 -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v8i64_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v8i64_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_usat_v8i64_v8i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp ult <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = trunc <8 x i64> %2 to <8 x i8> - ret <8 x i8> %3 -} - -define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) { -; SSE2-LABEL: trunc_usat_v8i64_v8i8_store: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; 
SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm5, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm10, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; SSE2-NEXT: por %xmm7, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm5, %xmm0 -; SSE2-NEXT: movdqa %xmm9, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pand %xmm6, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: por %xmm0, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm9, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm0 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm0 -; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm0, %xmm1 -; SSE2-NEXT: movq %xmm1, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v8i64_v8i8_store: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm5, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm5, %xmm1 -; SSSE3-NEXT: movdqa %xmm9, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6 -; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm10, %xmm7 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; SSSE3-NEXT: por %xmm7, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm0, %xmm1 -; SSSE3-NEXT: packuswb %xmm4, %xmm1 -; 
SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm5, %xmm0 -; SSSE3-NEXT: movdqa %xmm9, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4 -; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSSE3-NEXT: por %xmm0, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, %xmm0 -; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm0 -; SSSE3-NEXT: pand %xmm0, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm0 -; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm4, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm1 -; SSSE3-NEXT: packuswb %xmm0, %xmm1 -; SSSE3-NEXT: movq %xmm1, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v8i64_v8i8_store: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1 -; SSE41-NEXT: packusdw %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm7, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm5 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; SSE41-NEXT: pand %xmm5, %xmm0 -; SSE41-NEXT: por %xmm6, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm5 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5 -; SSE41-NEXT: pxor %xmm2, %xmm7 -; SSE41-NEXT: movdqa %xmm4, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm7, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm4, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9 -; SSE41-NEXT: packusdw %xmm5, %xmm9 -; SSE41-NEXT: packusdw %xmm9, %xmm1 -; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: movq %xmm1, (%rdi) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v8i64_v8i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063] -; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6 -; AVX1-NEXT: 
vpcmpgtq %xmm6, %xmm4, %xmm6 -; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255] -; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2 -; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v8i64_v8i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] -; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_usat_v8i64_v8i8_store: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovusqb %zmm0, (%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp ult <8 x i64> %a0, - %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> - %3 = trunc <8 x i64> %2 to <8 x i8> - store <8 x i8> %3, <8 x i8> *%p1 - ret void -} - -define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) { -; SSE2-LABEL: trunc_usat_v16i64_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456] -; SSE2-NEXT: movdqa %xmm1, %xmm11 -; SSE2-NEXT: pxor %xmm9, %xmm11 -; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] -; SSE2-NEXT: movdqa %xmm10, %xmm12 -; SSE2-NEXT: pcmpgtd %xmm11, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSE2-NEXT: pand %xmm13, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] -; SSE2-NEXT: por %xmm11, %xmm12 -; SSE2-NEXT: pand %xmm12, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm12 -; SSE2-NEXT: por %xmm1, %xmm12 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3] -; 
SSE2-NEXT: pand %xmm13, %xmm14 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm14, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm12, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm9, %xmm3 -; SSE2-NEXT: movdqa %xmm10, %xmm11 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm11 -; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] -; SSE2-NEXT: pand %xmm12, %xmm13 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] -; SSE2-NEXT: por %xmm13, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: packuswb %xmm1, %xmm3 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm5 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm9, %xmm1 -; SSE2-NEXT: movdqa %xmm10, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSE2-NEXT: pand %xmm11, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm1 -; SSE2-NEXT: pand %xmm1, %xmm4 -; SSE2-NEXT: pandn %xmm8, %xmm1 -; SSE2-NEXT: por %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm9, %xmm2 -; SSE2-NEXT: movdqa %xmm10, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: por %xmm2, %xmm3 -; SSE2-NEXT: pand %xmm3, %xmm7 -; SSE2-NEXT: pandn %xmm8, %xmm3 -; SSE2-NEXT: por %xmm7, %xmm3 -; SSE2-NEXT: pxor %xmm6, %xmm9 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: pcmpgtd %xmm9, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSE2-NEXT: pcmpeqd %xmm10, %xmm9 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] -; SSE2-NEXT: pand %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: por %xmm5, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm6 -; SSE2-NEXT: pandn %xmm8, %xmm2 -; SSE2-NEXT: por %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm1 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v16i64_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = 
[9223372039002259456,9223372039002259456] -; SSSE3-NEXT: movdqa %xmm1, %xmm11 -; SSSE3-NEXT: pxor %xmm9, %xmm11 -; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] -; SSSE3-NEXT: movdqa %xmm10, %xmm12 -; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3] -; SSSE3-NEXT: por %xmm11, %xmm12 -; SSSE3-NEXT: pand %xmm12, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm12 -; SSSE3-NEXT: por %xmm1, %xmm12 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm10, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm13, %xmm14 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm14, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm1, %xmm0 -; SSSE3-NEXT: packuswb %xmm12, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm10, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm13, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: pxor %xmm9, %xmm3 -; SSSE3-NEXT: movdqa %xmm10, %xmm11 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm11 -; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3] -; SSSE3-NEXT: pand %xmm12, %xmm13 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3] -; SSSE3-NEXT: por %xmm13, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: packuswb %xmm1, %xmm3 -; SSSE3-NEXT: packuswb %xmm3, %xmm0 -; SSSE3-NEXT: movdqa %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm10, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm3, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm1, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm5 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: movdqa %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm9, %xmm1 -; SSSE3-NEXT: movdqa %xmm10, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] -; SSSE3-NEXT: pand %xmm11, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm1 -; SSSE3-NEXT: pand %xmm1, %xmm4 -; SSSE3-NEXT: pandn %xmm8, %xmm1 -; SSSE3-NEXT: por %xmm4, %xmm1 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm7, %xmm2 -; SSSE3-NEXT: pxor %xmm9, %xmm2 -; SSSE3-NEXT: movdqa %xmm10, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2 -; SSSE3-NEXT: pshufd 
{{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSSE3-NEXT: por %xmm2, %xmm3 -; SSSE3-NEXT: pand %xmm3, %xmm7 -; SSSE3-NEXT: pandn %xmm8, %xmm3 -; SSSE3-NEXT: por %xmm7, %xmm3 -; SSSE3-NEXT: pxor %xmm6, %xmm9 -; SSSE3-NEXT: movdqa %xmm10, %xmm2 -; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2] -; SSSE3-NEXT: pcmpeqd %xmm10, %xmm9 -; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3] -; SSSE3-NEXT: pand %xmm4, %xmm5 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSSE3-NEXT: por %xmm5, %xmm2 -; SSSE3-NEXT: pand %xmm2, %xmm6 -; SSSE3-NEXT: pandn %xmm8, %xmm2 -; SSSE3-NEXT: por %xmm6, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: packuswb %xmm2, %xmm1 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v16i64_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa %xmm0, %xmm8 -; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255] -; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456] -; SSE41-NEXT: movdqa %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711] -; SSE41-NEXT: movdqa %xmm10, %xmm12 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm12 -; SSE41-NEXT: movdqa %xmm10, %xmm13 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm13 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2] -; SSE41-NEXT: pand %xmm12, %xmm0 -; SSE41-NEXT: por %xmm13, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm12 -; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12 -; SSE41-NEXT: movdqa %xmm8, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm13 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm13 -; SSE41-NEXT: movdqa %xmm10, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm13, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm13 -; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm13 -; SSE41-NEXT: packusdw %xmm12, %xmm13 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm8 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm8 -; SSE41-NEXT: movdqa %xmm10, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm8, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm8 -; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8 -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; SSE41-NEXT: movdqa %xmm10, %xmm1 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm1, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; SSE41-NEXT: packusdw %xmm8, %xmm1 -; SSE41-NEXT: packusdw %xmm1, %xmm13 -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm10, %xmm2 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm10, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm2, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm2 -; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; SSE41-NEXT: packusdw %xmm1, %xmm2 -; SSE41-NEXT: movdqa %xmm7, %xmm0 -; SSE41-NEXT: pxor %xmm11, %xmm0 -; SSE41-NEXT: movdqa %xmm10, %xmm1 -; SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm10, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; SSE41-NEXT: pand %xmm1, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: movapd %xmm9, %xmm1 -; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; SSE41-NEXT: pxor %xmm6, %xmm11 -; SSE41-NEXT: movdqa %xmm10, %xmm3 -; SSE41-NEXT: pcmpeqd %xmm11, %xmm3 -; SSE41-NEXT: pcmpgtd %xmm11, %xmm10 -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2] -; SSE41-NEXT: pand %xmm3, %xmm0 -; SSE41-NEXT: por %xmm10, %xmm0 -; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9 -; SSE41-NEXT: packusdw %xmm1, %xmm9 -; SSE41-NEXT: packusdw %xmm9, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm13 -; SSE41-NEXT: movdqa %xmm13, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v16i64_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa %ymm0, %ymm8 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] -; AVX1-NEXT: vpxor %xmm5, %xmm8, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063] -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm11 -; AVX1-NEXT: vpxor %xmm5, %xmm11, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm10 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14 -; AVX1-NEXT: vpxor %xmm5, %xmm14, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm12 -; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm13 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 -; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm15 -; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm4 -; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0 -; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm5 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [255,255] -; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm9 -; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm15, %xmm7, %xmm6, %xmm4 -; AVX1-NEXT: vblendvpd %xmm13, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vblendvpd %xmm12, %xmm14, %xmm6, %xmm5 -; AVX1-NEXT: vblendvpd %xmm10, %xmm1, %xmm6, %xmm1 -; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vblendvpd %xmm0, %xmm11, %xmm6, %xmm7 -; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX1-NEXT: vblendvpd %xmm0, %xmm8, %xmm6, %xmm6 -; AVX1-NEXT: vpackusdw %xmm9, %xmm3, %xmm0 -; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm2 -; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v16i64_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255] -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; 
AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063] -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 -; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1 -; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm6 -; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6 -; AVX2-NEXT: vblendvpd %ymm6, %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm1 -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1 -; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm3 -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm7, %ymm3 -; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_usat_v16i64_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255] -; AVX512-NEXT: vpminuq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp ult <16 x i64> %a0, - %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> - %3 = trunc <16 x i64> %2 to <16 x i8> - ret <16 x i8> %3 -} - -define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) { -; SSE2-LABEL: trunc_usat_v8i32_v8i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm4, %xmm5 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm5, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm3, %xmm6 -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm2 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v8i32_v8i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm6, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm1, %xmm5 -; SSSE3-NEXT: pshufb %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v8i32_v8i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = 
[255,255,255,255] -; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: pminud %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm2, %xmm1 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v8i32_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v8i32_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_usat_v8i32_v8i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_usat_v8i32_v8i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_usat_v8i32_v8i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp ult <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = trunc <8 x i32> %2 to <8 x i8> - ret <8 x i8> %3 -} - -define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) { -; SSE2-LABEL: trunc_usat_v8i32_v8i8_store: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm3, %xmm4 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm5, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm6, %xmm1 -; SSE2-NEXT: pandn %xmm2, %xmm6 -; SSE2-NEXT: por %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm5 -; SSE2-NEXT: por %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm6, %xmm5 -; SSE2-NEXT: packuswb %xmm0, %xmm5 -; SSE2-NEXT: movq %xmm5, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v8i32_v8i8_store: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = 
[2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm0, %xmm4 -; SSSE3-NEXT: pxor %xmm3, %xmm4 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm5, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6 -; SSSE3-NEXT: pand %xmm6, %xmm0 -; SSSE3-NEXT: pandn %xmm2, %xmm6 -; SSSE3-NEXT: por %xmm0, %xmm6 -; SSSE3-NEXT: pxor %xmm1, %xmm3 -; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm1 -; SSSE3-NEXT: pandn %xmm2, %xmm5 -; SSSE3-NEXT: por %xmm1, %xmm5 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm0, %xmm5 -; SSSE3-NEXT: pshufb %xmm0, %xmm6 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] -; SSSE3-NEXT: movq %xmm6, (%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v8i32_v8i8_store: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255] -; SSE41-NEXT: pminud %xmm2, %xmm0 -; SSE41-NEXT: pminud %xmm2, %xmm1 -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm2, %xmm1 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE41-NEXT: movq %xmm0, (%rdi) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v8i32_v8i8_store: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255] -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX1-NEXT: vmovq %xmm0, (%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v8i32_v8i8_store: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_usat_v8i32_v8i8_store: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, (%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_usat_v8i32_v8i8_store: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_usat_v8i32_v8i8_store: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255] -; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8_store: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp ult <8 x i32> %a0, - %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> - %3 = trunc <8 x i32> %2 to <8 x i8> - store <8 x i8> %3, <8 x i8> *%p1 - ret void -} - -define <16 x i8> 
@trunc_usat_v16i32_v16i8(<16 x i32> %a0) { -; SSE2-LABEL: trunc_usat_v16i32_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] -; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSE2-NEXT: movdqa %xmm1, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; SSE2-NEXT: pand %xmm7, %xmm0 -; SSE2-NEXT: pandn %xmm8, %xmm7 -; SSE2-NEXT: por %xmm7, %xmm0 -; SSE2-NEXT: packuswb %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm4 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pandn %xmm8, %xmm4 -; SSE2-NEXT: por %xmm3, %xmm4 -; SSE2-NEXT: pxor %xmm2, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm6, %xmm5 -; SSE2-NEXT: pand %xmm5, %xmm2 -; SSE2-NEXT: pandn %xmm8, %xmm5 -; SSE2-NEXT: por %xmm2, %xmm5 -; SSE2-NEXT: packuswb %xmm4, %xmm5 -; SSE2-NEXT: packuswb %xmm5, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v16i32_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255] -; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648] -; SSSE3-NEXT: movdqa %xmm1, %xmm7 -; SSSE3-NEXT: pxor %xmm6, %xmm7 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903] -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm1, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm6, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm7 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7 -; SSSE3-NEXT: pand %xmm7, %xmm0 -; SSSE3-NEXT: pandn %xmm8, %xmm7 -; SSSE3-NEXT: por %xmm7, %xmm0 -; SSSE3-NEXT: packuswb %xmm4, %xmm0 -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm6, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm4 -; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4 -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pandn %xmm8, %xmm4 -; SSSE3-NEXT: por %xmm3, %xmm4 -; SSSE3-NEXT: pxor %xmm2, %xmm6 -; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5 -; SSSE3-NEXT: pand %xmm5, %xmm2 -; SSSE3-NEXT: pandn %xmm8, %xmm5 -; SSSE3-NEXT: por %xmm2, %xmm5 -; SSSE3-NEXT: packuswb %xmm4, %xmm5 -; SSSE3-NEXT: packuswb %xmm5, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v16i32_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255] -; SSE41-NEXT: pminud %xmm4, %xmm1 -; SSE41-NEXT: pminud %xmm4, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pminud %xmm4, %xmm3 -; SSE41-NEXT: pminud %xmm4, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v16i32_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255] -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; 
AVX2-LABEL: trunc_usat_v16i32_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc_usat_v16i32_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovusdb %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq - %1 = icmp ult <16 x i32> %a0, - %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> - %3 = trunc <16 x i32> %2 to <16 x i8> - ret <16 x i8> %3 -} - -define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) { -; SSE2-LABEL: trunc_usat_v16i16_v16i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023] -; SSE2-NEXT: pminsw %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: pminsw %xmm3, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v16i16_v16i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023] -; SSSE3-NEXT: pminsw %xmm3, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: pminsw %xmm3, %xmm0 -; SSSE3-NEXT: pxor %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v16i16_v16i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pminuw %xmm2, %xmm1 -; SSE41-NEXT: pminuw %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v16i16_v16i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v16i16_v16i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc_usat_v16i16_v16i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_usat_v16i16_v16i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vzeroupper 
-; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_usat_v16i16_v16i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_usat_v16i16_v16i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = icmp ult <16 x i16> %a0, - %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> - %3 = trunc <16 x i16> %2 to <16 x i8> - ret <16 x i8> %3 -} - -define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) { -; SSE2-LABEL: trunc_usat_v32i16_v32i8: -; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023] -; SSE2-NEXT: pminsw %xmm5, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: pminsw %xmm5, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pminsw %xmm5, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: pminsw %xmm5, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc_usat_v32i16_v32i8: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768] -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023] -; SSSE3-NEXT: pminsw %xmm5, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: pminsw %xmm5, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pminsw %xmm5, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: pminsw %xmm5, %xmm0 -; SSSE3-NEXT: pxor %xmm4, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc_usat_v32i16_v32i8: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pminuw %xmm4, %xmm3 -; SSE41-NEXT: pminuw %xmm4, %xmm2 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: pminuw %xmm4, %xmm1 -; SSE41-NEXT: pminuw %xmm4, %xmm0 -; SSE41-NEXT: packuswb %xmm1, %xmm0 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc_usat_v32i16_v32i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_usat_v32i16_v32i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpminuw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: 
trunc_usat_v32i16_v32i8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512F-NEXT: vpminuw %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc_usat_v32i16_v32i8: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-NEXT: vpminuw %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpminuw %ymm2, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc_usat_v32i16_v32i8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc_usat_v32i16_v32i8: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0 -; AVX512BWVL-NEXT: retq - %1 = icmp ult <32 x i16> %a0, - %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> - %3 = trunc <32 x i16> %2 to <32 x i8> - ret <32 x i8> %3 -} diff --git a/test/CodeGen/X86/vector-trunc-widen.ll b/test/CodeGen/X86/vector-trunc-widen.ll deleted file mode 100644 index 42bf9a7c049..00000000000 --- a/test/CodeGen/X86/vector-trunc-widen.ll +++ /dev/null @@ -1,2126 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s 
--check-prefixes=AVX,AVX2,AVX2-FAST -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL - -define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { -; SSE-LABEL: trunc8i64_8i32: -; SSE: # %bb.0: # %entry -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc8i64_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc8i64_8i32: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc8i64_8i32: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc8i64_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: retq -entry: - %0 = trunc <8 x i64> %a to <8 x i32> - ret <8 x i32> %0 -} - -define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) { -; SSE-LABEL: trunc8i64_8i32_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc8i64_8i32_ashr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: 
retq -; -; AVX512-LABEL: trunc8i64_8i32_ashr: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: retq -entry: - %0 = ashr <8 x i64> %a, - %1 = trunc <8 x i64> %0 to <8 x i32> - ret <8 x i32> %1 -} - -define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) { -; SSE-LABEL: trunc8i64_8i32_lshr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc8i64_8i32_lshr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc8i64_8i32_lshr: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0 -; AVX512-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512-NEXT: retq -entry: - %0 = lshr <8 x i64> %a, - %1 = trunc <8 x i64> %0 to <8 x i32> - ret <8 x i32> %1 -} - -define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { -; SSE2-LABEL: trunc8i64_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i64_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i64_8i16: 
-; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7] -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7] -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc8i64_8i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc8i64_8i16: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc8i64_8i16: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512-LABEL: trunc8i64_8i16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %0 = trunc <8 x i64> %a to <8 x i16> - ret <8 x i16> %0 -} - -define void @trunc8i64_8i8(<8 x i64> %a) { -; SSE2-LABEL: trunc8i64_8i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i64_8i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: packuswb %xmm0, %xmm0 -; SSSE3-NEXT: movq %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i64_8i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = 
[255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: packusdw %xmm2, %xmm0 -; SSE41-NEXT: packuswb %xmm0, %xmm0 -; SSE41-NEXT: movq %xmm0, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc8i64_8i8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovq %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc8i64_8i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc8i64_8i8: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovqb %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %0 = trunc <8 x i64> %a to <8 x i8> - store <8 x i8> %0, <8 x i8>* undef, align 4 - ret void -} - -define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) { -; SSE2-LABEL: trunc8i32_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i32_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i32_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm2, %xmm1 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc8i32_8i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc8i32_8i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; 
AVX512F-LABEL: trunc8i32_8i16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc8i32_8i16: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc8i32_8i16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc8i32_8i16: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = trunc <8 x i32> %a to <8 x i16> - ret <8 x i16> %0 -} - -define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) { -; SSE-LABEL: trunc8i32_8i16_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc8i32_8i16_ashr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc8i32_8i16_ashr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc8i32_8i16_ashr: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc8i32_8i16_ashr: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc8i32_8i16_ashr: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc8i32_8i16_ashr: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = ashr <8 x i32> %a, - %1 = trunc <8 x i32> %0 to <8 x i16> - ret <8 x i16> %1 -} - -define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) { -; SSE2-LABEL: trunc8i32_8i16_lshr: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i32_8i16_lshr: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i32_8i16_lshr: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc8i32_8i16_lshr: -; AVX1: # %bb.0: # %entry -; 
AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc8i32_8i16_lshr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc8i32_8i16_lshr: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc8i32_8i16_lshr: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc8i32_8i16_lshr: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc8i32_8i16_lshr: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = lshr <8 x i32> %a, - %1 = trunc <8 x i32> %0 to <8 x i16> - ret <8 x i16> %1 -} - -define void @trunc8i32_8i8(<8 x i32> %a) { -; SSE2-LABEL: trunc8i32_8i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i32_8i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movq %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i32_8i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm2, %xmm1 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE41-NEXT: movq %xmm0, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc8i32_8i8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc8i32_8i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc8i32_8i8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512F-NEXT: vmovq %xmm0, 
(%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc8i32_8i8: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovdb %ymm0, (%rax) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc8i32_8i8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0 -; AVX512BW-NEXT: vmovq %xmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc8i32_8i8: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = trunc <8 x i32> %a to <8 x i8> - store <8 x i8> %0, <8 x i8>* undef, align 4 - ret void -} - -define void @trunc16i32_16i16(<16 x i32> %a) { -; SSE2-LABEL: trunc16i32_16i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc16i32_16i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pslld $16, %xmm1 -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: pslld $16, %xmm0 -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSSE3-NEXT: pslld $16, %xmm3 -; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: pslld $16, %xmm2 -; SSSE3-NEXT: psrad $16, %xmm2 -; SSSE3-NEXT: packssdw %xmm3, %xmm2 -; SSSE3-NEXT: movdqu %xmm2, (%rax) -; SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i32_16i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7] -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: movdqu %xmm2, (%rax) -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc16i32_16i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i32_16i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqu %xmm1, (%rax) -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc16i32_16i16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovdw %zmm0, (%rax) 
-; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %0 = trunc <16 x i32> %a to <16 x i16> - store <16 x i16> %0, <16 x i16>* undef, align 4 - ret void -} - -define void @trunc16i32_16i16_ashr(<16 x i32> %a) { -; SSE-LABEL: trunc16i32_16i16_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $16, %xmm3 -; SSE-NEXT: psrad $16, %xmm2 -; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: movdqu %xmm2, (%rax) -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc16i32_16i16_ashr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i32_16i16_ashr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc16i32_16i16_ashr: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdw %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %0 = ashr <16 x i32> %a, - %1 = trunc <16 x i32> %0 to <16 x i16> - store <16 x i16> %1, <16 x i16>* undef, align 4 - ret void -} - -define void @trunc16i32_16i16_lshr(<16 x i32> %a) { -; SSE2-LABEL: trunc16i32_16i16_lshr: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc16i32_16i16_lshr: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: psrad $16, %xmm1 -; SSSE3-NEXT: psrad $16, %xmm0 -; SSSE3-NEXT: packssdw %xmm1, %xmm0 -; SSSE3-NEXT: psrad $16, %xmm3 -; SSSE3-NEXT: psrad $16, %xmm2 -; SSSE3-NEXT: packssdw %xmm3, %xmm2 -; SSSE3-NEXT: movdqu %xmm2, (%rax) -; SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i32_16i16_lshr: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: psrld $16, %xmm3 -; SSE41-NEXT: psrld $16, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: movdqu %xmm2, (%rax) -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc16i32_16i16_lshr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i32_16i16_lshr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw 
%ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc16i32_16i16_lshr: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdw %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %0 = lshr <16 x i32> %a, - %1 = trunc <16 x i32> %0 to <16 x i16> - store <16 x i16> %1, <16 x i16>* undef, align 4 - ret void -} - -define void @trunc16i32_16i8(<16 x i32> %a) { -; SSE2-LABEL: trunc16i32_16i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc16i32_16i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i32_16i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE41-NEXT: pand %xmm4, %xmm3 -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc16i32_16i8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i32_16i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc16i32_16i8: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovdb %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %0 = trunc <16 x i32> %a to <16 x i8> - store <16 x i8> %0, <16 x i8>* undef, align 4 - ret void -} - -define void @trunc16i32_16i8_ashr(<16 x i32> %a) { -; SSE-LABEL: trunc16i32_16i8_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrad $24, %xmm1 -; SSE-NEXT: psrad $24, %xmm0 -; SSE-NEXT: packssdw %xmm1, %xmm0 -; SSE-NEXT: psrad $24, %xmm3 -; SSE-NEXT: psrad $24, %xmm2 
-; SSE-NEXT: packssdw %xmm3, %xmm2 -; SSE-NEXT: packsswb %xmm2, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc16i32_16i8_ashr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0 -; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1 -; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i32_16i8_ashr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1 -; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0 -; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc16i32_16i8_ashr: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %0 = ashr <16 x i32> %a, - %1 = trunc <16 x i32> %0 to <16 x i8> - store <16 x i8> %1, <16 x i8>* undef, align 4 - ret void -} - -define void @trunc16i32_16i8_lshr(<16 x i32> %a) { -; SSE2-LABEL: trunc16i32_16i8_lshr: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: psrld $24, %xmm1 -; SSE2-NEXT: psrld $24, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: psrld $24, %xmm3 -; SSE2-NEXT: psrld $24, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: packuswb %xmm2, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc16i32_16i8_lshr: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: psrld $24, %xmm1 -; SSSE3-NEXT: psrld $24, %xmm0 -; SSSE3-NEXT: packuswb %xmm1, %xmm0 -; SSSE3-NEXT: psrld $24, %xmm3 -; SSSE3-NEXT: psrld $24, %xmm2 -; SSSE3-NEXT: packuswb %xmm3, %xmm2 -; SSSE3-NEXT: packuswb %xmm2, %xmm0 -; SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i32_16i8_lshr: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: psrld $24, %xmm1 -; SSE41-NEXT: psrld $24, %xmm0 -; SSE41-NEXT: packusdw %xmm1, %xmm0 -; SSE41-NEXT: psrld $24, %xmm3 -; SSE41-NEXT: psrld $24, %xmm2 -; SSE41-NEXT: packusdw %xmm3, %xmm2 -; SSE41-NEXT: packuswb %xmm2, %xmm0 -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc16i32_16i8_lshr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0 -; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2 -; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1 -; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i32_16i8_lshr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1 -; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0 -; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: trunc16i32_16i8_lshr: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsrld $24, 
%zmm0, %zmm0 -; AVX512-NEXT: vpmovdb %zmm0, (%rax) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq -entry: - %0 = lshr <16 x i32> %a, - %1 = trunc <16 x i32> %0 to <16 x i8> - store <16 x i8> %1, <16 x i8>* undef, align 4 - ret void -} - -;PR25684 -define void @trunc16i16_16i8(<16 x i16> %a) { -; SSE2-LABEL: trunc16i16_16i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc16i16_16i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc16i16_16i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm2, %xmm1 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc16i16_16i8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i16_16i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc16i16_16i8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc16i16_16i8: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc16i16_16i8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc16i16_16i8: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = trunc <16 x i16> %a to <16 x i8> - store <16 x i8> %0, <16 x i8>* undef, align 4 - ret void -} - -define void @trunc16i16_16i8_ashr(<16 x i16> %a) { -; SSE-LABEL: trunc16i16_16i8_ashr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psraw $8, %xmm1 -; SSE-NEXT: psraw $8, %xmm0 -; SSE-NEXT: packsswb %xmm1, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc16i16_16i8_ashr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; 
AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i16_16i8_ashr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc16i16_16i8_ashr: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc16i16_16i8_ashr: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc16i16_16i8_ashr: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc16i16_16i8_ashr: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = ashr <16 x i16> %a, - %1 = trunc <16 x i16> %0 to <16 x i8> - store <16 x i8> %1, <16 x i8>* undef, align 4 - ret void -} - -define void @trunc16i16_16i8_lshr(<16 x i16> %a) { -; SSE-LABEL: trunc16i16_16i8_lshr: -; SSE: # %bb.0: # %entry -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqu %xmm0, (%rax) -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc16i16_16i8_lshr: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc16i16_16i8_lshr: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vmovdqu %xmm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc16i16_16i8_lshr: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc16i16_16i8_lshr: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc16i16_16i8_lshr: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc16i16_16i8_lshr: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = lshr <16 x i16> %a, - %1 = trunc <16 x i16> %0 to <16 x i8> - store <16 x i8> %1, <16 x i8>* undef, align 4 - ret void -} - -define void @trunc32i16_32i8(<32 x i16> %a) { -; SSE2-LABEL: trunc32i16_32i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm3, %xmm2 -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm0, (%rax) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc32i16_32i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm4, %xmm1 -; SSSE3-NEXT: pshufb %xmm4, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb %xmm4, %xmm3 -; SSSE3-NEXT: pshufb %xmm4, %xmm2 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSSE3-NEXT: movdqu %xmm2, (%rax) -; SSSE3-NEXT: movdqu %xmm0, (%rax) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc32i16_32i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm4, %xmm1 -; SSE41-NEXT: pshufb %xmm4, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: pshufb %xmm4, %xmm3 -; SSE41-NEXT: pshufb %xmm4, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: movdqu %xmm2, (%rax) -; SSE41-NEXT: movdqu %xmm0, (%rax) -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc32i16_32i8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqu %xmm1, (%rax) -; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc32i16_32i8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0 -; AVX2-NEXT: vmovdqu %ymm0, (%rax) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: trunc32i16_32i8: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = 
ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm1, (%rax) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, (%rax) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc32i16_32i8: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpmovdb %zmm1, (%rax) -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, (%rax) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc32i16_32i8: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpmovwb %zmm0, (%rax) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc32i16_32i8: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = trunc <32 x i16> %a to <32 x i8> - store <32 x i8> %0, <32 x i8>* undef, align 4 - ret void -} - -define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { -; SSE-LABEL: trunc2x4i64_8i32: -; SSE: # %bb.0: # %entry -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE-NEXT: movaps %xmm2, %xmm1 -; SSE-NEXT: retq -; -; AVX1-LABEL: trunc2x4i64_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc2x4i64_8i32: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc2x4i64_8i32: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7] -; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc2x4i64_8i32: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512F-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x4i64_8i32: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1 
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc2x4i64_8i32: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc2x4i64_8i32: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1 -; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: retq -entry: - %0 = trunc <4 x i64> %a to <4 x i32> - %1 = trunc <4 x i64> %b to <4 x i32> - %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> - ret <8 x i32> %2 -} - -define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) { -; SSE2-LABEL: trunc2x4i64_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc2x4i64_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x4i64_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: trunc2x4i64_8i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; 
AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: trunc2x4i64_8i16: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: trunc2x4i64_8i16: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15] -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: trunc2x4i64_8i16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512F-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x4i64_8i16: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc2x4i64_8i16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 -; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc2x4i64_8i16: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1 -; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq -entry: - %0 = trunc <4 x i64> %a to <4 x i16> - %1 = trunc <4 x i64> %b to <4 x i16> - %2 = shufflevector <4 x i16> 
%0, <4 x i16> %1, <8 x i32> - ret <8 x i16> %2 -} - -define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) { -; SSE-LABEL: trunc2x2i64_4i32: -; SSE: # %bb.0: # %entry -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE-NEXT: retq -; -; AVX-LABEL: trunc2x2i64_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX-NEXT: retq -; -; AVX512-LABEL: trunc2x2i64_4i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; AVX512-NEXT: retq -entry: - %0 = trunc <2 x i64> %a to <2 x i32> - %1 = trunc <2 x i64> %b to <2 x i32> - %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> - ret <4 x i32> %2 -} - -define i64 @trunc2i64_i64(<2 x i64> %inval) { -; SSE-LABEL: trunc2i64_i64: -; SSE: # %bb.0: # %entry -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: retq -; -; AVX-LABEL: trunc2i64_i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512-LABEL: trunc2i64_i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: retq -entry: - %0 = trunc <2 x i64> %inval to <2 x i32> - %1 = bitcast <2 x i32> %0 to i64 - ret i64 %1 -} - -define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) { -; SSE2-LABEL: trunc2x4i32_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc2x4i32_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x4i32_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm2, %xmm1 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq -; -; AVX-LABEL: trunc2x4i32_8i16: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq -; -; AVX512F-LABEL: trunc2x4i32_8i16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: trunc2x4i32_8i16: -; AVX512VL: # %bb.0: # %entry -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: trunc2x4i32_8i16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = 
[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: trunc2x4i32_8i16: -; AVX512BWVL: # %bb.0: # %entry -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14] -; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0 -; AVX512BWVL-NEXT: retq -entry: - %0 = trunc <4 x i32> %a to <4 x i16> - %1 = trunc <4 x i32> %b to <4 x i16> - %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> - ret <8 x i16> %2 -} - -; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 -define i64 @trunc4i32_i64(<4 x i32> %inval) { -; SSE2-LABEL: trunc4i32_i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc4i32_i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc4i32_i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: trunc4i32_i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512-LABEL: trunc4i32_i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: retq -entry: - %0 = trunc <4 x i32> %inval to <4 x i16> - %1 = bitcast <4 x i16> %0 to i64 - ret i64 %1 -} - -define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) { -; SSE2-LABEL: trunc2x8i16_16i8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: packuswb %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc2x8i16_16i8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSSE3-NEXT: pshufb %xmm2, %xmm1 -; SSSE3-NEXT: pshufb %xmm2, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc2x8i16_16i8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; SSE41-NEXT: pshufb %xmm2, %xmm1 -; SSE41-NEXT: pshufb %xmm2, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: retq -; -; AVX-LABEL: trunc2x8i16_16i8: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: retq -; -; AVX512-LABEL: trunc2x8i16_16i8: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> -; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX512-NEXT: retq -entry: - %0 = trunc <8 x i16> %a to <8 x i8> - %1 = trunc <8 x i16> %b to <8 x i8> - %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> - ret <16 x i8> %2 -} - 
-; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524 -define i64 @trunc8i16_i64(<8 x i16> %inval) { -; SSE2-LABEL: trunc8i16_i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: packuswb %xmm0, %xmm0 -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: retq -; -; SSSE3-LABEL: trunc8i16_i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSSE3-NEXT: movq %xmm0, %rax -; SSSE3-NEXT: retq -; -; SSE41-LABEL: trunc8i16_i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; SSE41-NEXT: movq %xmm0, %rax -; SSE41-NEXT: retq -; -; AVX-LABEL: trunc8i16_i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: retq -; -; AVX512-LABEL: trunc8i16_i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] -; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: retq -entry: - %0 = trunc <8 x i16> %inval to <8 x i8> - %1 = bitcast <8 x i8> %0 to i64 - ret i64 %1 -} - -define <16 x i8> @trunc16i64_16i8_const() { -; SSE-LABEL: trunc16i64_16i8_const: -; SSE: # %bb.0: # %entry -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: retq -; -; AVX-LABEL: trunc16i64_16i8_const: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq -; -; AVX512-LABEL: trunc16i64_16i8_const: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT: retq - -entry: - %0 = trunc <16 x i64> zeroinitializer to <16 x i8> - %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> - ret <16 x i8> %1 -} - -define <8 x i16> @PR32160(<8 x i32> %x) { -; SSE-LABEL: PR32160: -; SSE: # %bb.0: -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] -; SSE-NEXT: retq -; -; AVX1-LABEL: PR32160: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: PR32160: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: PR32160: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9] -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: PR32160: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7] -; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: PR32160: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: PR32160: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: PR32160: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0 -; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5] -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq 
- %shuf = trunc <8 x i32> %x to <8 x i16> - %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> - ret <8 x i16> %trunc -} - -define void @PR34773(i16* %a0, i8* %a1) { -; SSE-LABEL: PR34773: -; SSE: # %bb.0: -; SSE-NEXT: movdqu (%rdi), %xmm0 -; SSE-NEXT: movdqu 16(%rdi), %xmm1 -; SSE-NEXT: movdqu 32(%rdi), %xmm2 -; SSE-NEXT: movdqu 48(%rdi), %xmm3 -; SSE-NEXT: psrlw $8, %xmm1 -; SSE-NEXT: psrlw $8, %xmm0 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: psrlw $8, %xmm3 -; SSE-NEXT: psrlw $8, %xmm2 -; SSE-NEXT: packuswb %xmm3, %xmm2 -; SSE-NEXT: movdqu %xmm0, (%rsi) -; SSE-NEXT: movdqu %xmm2, 16(%rsi) -; SSE-NEXT: retq -; -; AVX1-LABEL: PR34773: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqu (%rdi), %xmm0 -; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 -; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 -; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 -; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1 -; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vmovdqu %xmm0, (%rsi) -; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: PR34773: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqu (%rdi), %ymm0 -; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vmovdqu %xmm0, (%rsi) -; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: PR34773: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: PR34773: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi) -; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero -; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: PR34773: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1 -; AVX512BW-NEXT: vpsrlw $8, %ymm0, 
%ymm0 -; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1 -; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1 -; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi) -; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: PR34773: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0 -; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1 -; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi) -; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %1 = getelementptr i16, i16* %a0, i64 16 - %2 = getelementptr i8, i8* %a1, i64 16 - %3 = bitcast i16* %a0 to <16 x i16>* - %4 = bitcast i16* %1 to <16 x i16>* - %5 = bitcast i8* %a1 to <16 x i8>* - %6 = bitcast i8* %2 to <16 x i8>* - %7 = load <16 x i16>, <16 x i16>* %3, align 2 - %8 = load <16 x i16>, <16 x i16>* %4, align 2 - %9 = lshr <16 x i16> %7, - %10 = lshr <16 x i16> %8, - %11 = trunc <16 x i16> %9 to <16 x i8> - %12 = trunc <16 x i16> %10 to <16 x i8> - store <16 x i8> %11, <16 x i8>* %5, align 1 - store <16 x i8> %12, <16 x i8>* %6, align 1 - ret void -} - -; Store merging must not infinitely fight store splitting. - -define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>* %p) align 2 { -; SSE2-LABEL: store_merge_split: -; SSE2: # %bb.0: -; SSE2-NEXT: pslld $16, %xmm1 -; SSE2-NEXT: psrad $16, %xmm1 -; SSE2-NEXT: pslld $16, %xmm0 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: pslld $16, %xmm3 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: pslld $16, %xmm2 -; SSE2-NEXT: psrad $16, %xmm2 -; SSE2-NEXT: packssdw %xmm3, %xmm2 -; SSE2-NEXT: shlq $4, %rdi -; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi) -; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: store_merge_split: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSSE3-NEXT: pshufb %xmm4, %xmm1 -; SSSE3-NEXT: pshufb %xmm4, %xmm0 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSSE3-NEXT: pshufb %xmm4, %xmm3 -; SSSE3-NEXT: pshufb %xmm4, %xmm2 -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSSE3-NEXT: shlq $4, %rdi -; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi) -; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi) -; SSSE3-NEXT: retq -; -; SSE41-LABEL: store_merge_split: -; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm4, %xmm1 -; SSE41-NEXT: pshufb %xmm4, %xmm0 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE41-NEXT: pshufb %xmm4, %xmm3 -; SSE41-NEXT: pshufb %xmm4, %xmm2 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: shlq $4, %rdi -; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi) -; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi) -; SSE41-NEXT: retq -; -; AVX1-LABEL: store_merge_split: -; AVX1: # %bb.0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-NEXT: shlq $4, %rdi -; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi) -; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: store_merge_split: -; AVX2: # 
%bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-NEXT: shlq $4, %rdi -; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi) -; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512F-LABEL: store_merge_split: -; AVX512F: # %bb.0: -; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512F-NEXT: shlq $4, %rdi -; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi) -; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) -; AVX512F-NEXT: vzeroupper -; AVX512F-NEXT: retq -; -; AVX512VL-LABEL: store_merge_split: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: shlq $4, %rdi -; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) -; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) -; AVX512VL-NEXT: vzeroupper -; AVX512VL-NEXT: retq -; -; AVX512BW-LABEL: store_merge_split: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 -; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 -; AVX512BW-NEXT: shlq $4, %rdi -; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi) -; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) -; AVX512BW-NEXT: vzeroupper -; AVX512BW-NEXT: retq -; -; AVX512BWVL-LABEL: store_merge_split: -; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: shlq $4, %rdi -; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) -; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) -; AVX512BWVL-NEXT: vzeroupper -; AVX512BWVL-NEXT: retq - %t1 = trunc <8 x i32> %w1 to <8 x i16> - %t2 = trunc <8 x i32> %w2 to <8 x i16> - %g1 = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 %idx - %g2 = getelementptr inbounds <8 x i16>, <8 x i16>* %g1, i64 1 - store <8 x i16> %t1, <8 x i16>* %g1, align 2 - store <8 x i16> %t2, <8 x i16>* %g2, align 2 - ret void -} diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll index e9472b80871..b09d14e5e2b 100644 --- a/test/CodeGen/X86/vector-trunc.ll +++ b/test/CodeGen/X86/vector-trunc.ll @@ -1998,3 +1998,129 @@ define void @PR34773(i16* %a0, i8* %a1) { store <16 x i8> %12, <16 x i8>* %6, align 1 ret void } + +; Store merging must not infinitely fight store splitting. 
+ +define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>* %p) align 2 { +; SSE2-LABEL: store_merge_split: +; SSE2: # %bb.0: +; SSE2-NEXT: pslld $16, %xmm1 +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: pslld $16, %xmm0 +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: packssdw %xmm1, %xmm0 +; SSE2-NEXT: pslld $16, %xmm3 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: pslld $16, %xmm2 +; SSE2-NEXT: psrad $16, %xmm2 +; SSE2-NEXT: packssdw %xmm3, %xmm2 +; SSE2-NEXT: shlq $4, %rdi +; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi) +; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi) +; SSE2-NEXT: retq +; +; SSSE3-LABEL: store_merge_split: +; SSSE3: # %bb.0: +; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSSE3-NEXT: pshufb %xmm4, %xmm1 +; SSSE3-NEXT: pshufb %xmm4, %xmm0 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSSE3-NEXT: pshufb %xmm4, %xmm3 +; SSSE3-NEXT: pshufb %xmm4, %xmm2 +; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSSE3-NEXT: shlq $4, %rdi +; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi) +; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi) +; SSSE3-NEXT: retq +; +; SSE41-LABEL: store_merge_split: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; SSE41-NEXT: pshufb %xmm4, %xmm1 +; SSE41-NEXT: pshufb %xmm4, %xmm0 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: pshufb %xmm4, %xmm3 +; SSE41-NEXT: pshufb %xmm4, %xmm2 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] +; SSE41-NEXT: shlq $4, %rdi +; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi) +; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi) +; SSE41-NEXT: retq +; +; AVX1-LABEL: store_merge_split: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: shlq $4, %rdi +; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: store_merge_split: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] +; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] +; AVX2-NEXT: shlq $4, %rdi +; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: store_merge_split: +; AVX512F: # %bb.0: +; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512F-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512F-NEXT: shlq $4, %rdi +; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: store_merge_split: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: shlq $4, %rdi +; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) +; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: store_merge_split: +; 
AVX512BW: # %bb.0: +; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 +; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1 +; AVX512BW-NEXT: shlq $4, %rdi +; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi) +; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq +; +; AVX512BWVL-LABEL: store_merge_split: +; AVX512BWVL: # %bb.0: +; AVX512BWVL-NEXT: shlq $4, %rdi +; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi) +; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi) +; AVX512BWVL-NEXT: vzeroupper +; AVX512BWVL-NEXT: retq + %t1 = trunc <8 x i32> %w1 to <8 x i16> + %t2 = trunc <8 x i32> %w2 to <8 x i16> + %g1 = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 %idx + %g2 = getelementptr inbounds <8 x i16>, <8 x i16>* %g1, i64 1 + store <8 x i16> %t1, <8 x i16>* %g1, align 2 + store <8 x i16> %t2, <8 x i16>* %g2, align 2 + ret void +} diff --git a/test/CodeGen/X86/vector-zext-widen.ll b/test/CodeGen/X86/vector-zext-widen.ll deleted file mode 100644 index 78c36d6fc8f..00000000000 --- a/test/CodeGen/X86/vector-zext-widen.ll +++ /dev/null @@ -1,2741 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F -; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW - -define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_16i8_to_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i8_to_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i8_to_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: retq -; -; AVX-LABEL: zext_16i8_to_8i16: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX-NEXT: retq -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> - %C = zext <8 x i8> %B to <8 x i16> - ret <8 x i16> %C -} - -; PR17654 -define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) { -; SSE2-LABEL: zext_16i8_to_16i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i8_to_16i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i8_to_16i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_16i8_to_16i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_16i8_to_16i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_16i8_to_16i16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512-NEXT: retq -entry: - %B = zext <16 x i8> %A to <16 x i16> - ret <16 x i16> %B -} - -define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) { -; SSE2-LABEL: zext_32i8_to_32i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_32i8_to_32i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_32i8_to_32i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_32i8_to_32i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_32i8_to_32i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512F-LABEL: zext_32i8_to_32i16: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512F-NEXT: vmovdqa %ymm2, %ymm0 -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: zext_32i8_to_32i16: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; AVX512BW-NEXT: retq -entry: - %B = zext <32 x i8> %A to <32 x i16> - ret <32 x i16> %B -} - -define <4 x i32> @zext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_16i8_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i8_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i8_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: zext_16i8_to_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: retq -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> - %C = zext <4 x i8> %B to <4 x i32> - ret <4 x i32> %C -} - -define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_16i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i8_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i8_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_16i8_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_16i8_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_16i8_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> - %C = zext <8 x i8> %B to <8 x i32> - ret <8 x i32> %C -} - -define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_16i8_to_16i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = 
xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i8_to_16i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i8_to_16i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_16i8_to_16i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_16i8_to_16i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; 
AVX512-LABEL: zext_16i8_to_16i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = zext <16 x i8> %A to <16 x i32> - ret <16 x i32> %B -} - -define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_16i8_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i8_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i8_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: zext_16i8_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> - %C = zext <2 x i8> %B to <2 x i64> - ret <2 x i64> %C -} - -define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_16i8_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i8_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i8_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrld $16, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_16i8_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; 
AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_16i8_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_16i8_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> - %C = zext <4 x i8> %B to <4 x i64> - ret <4 x i64> %C -} - -define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_16i8_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i8_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i8_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrld $16, %xmm1 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrlq $48, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_16i8_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_16i8_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_16i8_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> - %C = zext <8 x i8> %B to <8 x i64> - ret <8 x i64> %C -} - -define <4 x i32> @zext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_8i16_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_8i16_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_8i16_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: retq -; -; AVX-LABEL: zext_8i16_to_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX-NEXT: retq -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> - %C = zext <4 x i16> %B to <4 x i32> - ret <4 x i32> %C -} - -define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_8i16_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_8i16_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklwd 
{{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_8i16_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_8i16_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_8i16_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_8i16_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: retq -entry: - %B = zext <8 x i16> %A to <8 x i32> - ret <8 x i32>%B -} - -define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_16i16_to_16i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_16i16_to_16i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_16i16_to_16i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_16i16_to_16i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, 
%xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_16i16_to_16i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_16i16_to_16i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512-NEXT: retq -entry: - %B = zext <16 x i16> %A to <16 x i32> - ret <16 x i32> %B -} - -define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_8i16_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_8i16_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_8i16_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: zext_8i16_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX-NEXT: retq -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> - %C = zext <2 x i16> %B to <2 x i64> - ret <2 x i64> %C -} - -define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_8i16_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_8i16_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; 
SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_8i16_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_8i16_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_8i16_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_8i16_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> - %C = zext <4 x i16> %B to <4 x i64> - ret <4 x i64> %C -} - -define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_8i16_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_8i16_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: pxor %xmm4, %xmm4 -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_8i16_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm4, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_8i16_to_8i64: -; AVX1: # %bb.0: # 
%entry -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_8i16_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_8i16_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = zext <8 x i16> %A to <8 x i64> - ret <8 x i64> %B -} - -define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_4i32_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_4i32_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_4i32_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: retq -; -; AVX-LABEL: zext_4i32_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: retq -entry: - %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> - %C = zext <2 x i32> %B to <2 x i64> - ret <2 x i64> %C -} - -define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_4i32_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_4i32_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movaps %xmm0, %xmm1 -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_4i32_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_4i32_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} 
xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_4i32_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_4i32_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: retq -entry: - %B = zext <4 x i32> %A to <4 x i64> - ret <4 x i64>%B -} - -define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_8i32_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movaps %xmm1, %xmm3 -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: xorps %xmm4, %xmm4 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: movaps %xmm3, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_8i32_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movaps %xmm1, %xmm3 -; SSSE3-NEXT: movaps %xmm0, %xmm1 -; SSSE3-NEXT: xorps %xmm4, %xmm4 -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSSE3-NEXT: movaps %xmm3, %xmm2 -; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSSE3-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_8i32_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa %xmm5, %xmm0 -; SSE41-NEXT: movdqa %xmm4, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_8i32_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 -; AVX1-NEXT: vmovaps %ymm2, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_8i32_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vmovdqa %ymm2, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_8i32_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; AVX512-NEXT: retq -entry: - %B = zext <8 x i32> %A to <8 x i64> - ret <8 x i64>%B -} - -define <2 x i64> @load_zext_2i8_to_2i64(<2 x i8> *%ptr) { -; SSE2-LABEL: load_zext_2i8_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movd 
%eax, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_2i8_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movzwl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_2i8_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: load_zext_2i8_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX-NEXT: retq -entry: - %X = load <2 x i8>, <2 x i8>* %ptr - %Y = zext <2 x i8> %X to <2 x i64> - ret <2 x i64> %Y -} - -define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) { -; SSE2-LABEL: load_zext_4i8_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_4i8_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_4i8_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: load_zext_4i8_to_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX-NEXT: retq -entry: - %X = load <4 x i8>, <4 x i8>* %ptr - %Y = zext <4 x i8> %X to <4 x i32> - ret <4 x i32> %Y -} - -define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) { -; SSE2-LABEL: load_zext_4i8_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_4i8_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSSE3-NEXT: movdqa %xmm1, %xmm0 
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_4i8_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_zext_4i8_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_zext_4i8_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_zext_4i8_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: retq -entry: - %X = load <4 x i8>, <4 x i8>* %ptr - %Y = zext <4 x i8> %X to <4 x i64> - ret <4 x i64> %Y -} - -define <8 x i16> @load_zext_8i8_to_8i16(<8 x i8> *%ptr) { -; SSE2-LABEL: load_zext_8i8_to_8i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_8i8_to_8i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_8i8_to_8i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE41-NEXT: retq -; -; AVX-LABEL: load_zext_8i8_to_8i16: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX-NEXT: retq -entry: - %X = load <8 x i8>, <8 x i8>* %ptr - %Y = zext <8 x i8> %X to <8 x i16> - ret <8 x i16> %Y -} - -define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) { -; SSE2-LABEL: load_zext_8i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_8i8_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_8i8_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_zext_8i8_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_zext_8i8_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_zext_8i8_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX512-NEXT: retq -entry: - %X = load <8 x i8>, <8 x i8>* %ptr - %Y = zext <8 x i8> %X to <8 x i32> - ret <8 x i32> %Y -} - -define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) { -; SSE2-LABEL: load_zext_16i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_16i8_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_16i8_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa (%rdi), %xmm1 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_zext_16i8_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_zext_16i8_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_zext_16i8_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX512-NEXT: retq -entry: - %X = load <16 x i8>, <16 x i8>* %ptr - %Y = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32> - %Z = zext <8 x i8> %Y to <8 x i32> - ret <8 x i32> %Z -} - -define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) { -; SSE2-LABEL: load_zext_8i8_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_8i8_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero -; SSSE3-NEXT: movdqa %xmm3, %xmm0 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_8i8_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_zext_8i8_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_zext_8i8_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_zext_8i8_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: retq -entry: - %X = load <8 x i8>, <8 x i8>* %ptr - %Y = zext <8 x i8> %X to <8 x i64> - ret <8 x i64> %Y -} - -define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) { -; SSE2-LABEL: load_zext_16i8_to_16i16: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_16i8_to_16i16: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_16i8_to_16i16: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_zext_16i8_to_16i16: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_zext_16i8_to_16i16: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_zext_16i8_to_16i16: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512-NEXT: retq -entry: - %X = load <16 x i8>, <16 x i8>* %ptr - %Y = zext <16 x i8> %X to <16 x i16> - ret <16 x i16> %Y -} - -define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) { -; SSE2-LABEL: load_zext_2i16_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_2i16_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_2i16_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: retq -; -; AVX-LABEL: load_zext_2i16_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX-NEXT: retq -entry: - %X = load <2 x i16>, <2 x i16>* %ptr - %Y = zext <2 x i16> %X to <2 x i64> - ret <2 x i64> %Y -} - -define <4 x i32> @load_zext_4i16_to_4i32(<4 x i16> *%ptr) { -; SSE2-LABEL: load_zext_4i16_to_4i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_4i16_to_4i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_4i16_to_4i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: retq -; -; AVX-LABEL: load_zext_4i16_to_4i32: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX-NEXT: retq -entry: - %X = load <4 x i16>, <4 x i16>* %ptr - %Y = zext <4 x i16> %X to 
<4 x i32> - ret <4 x i32> %Y -} - -define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) { -; SSE2-LABEL: load_zext_4i16_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_4i16_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_4i16_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_zext_4i16_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_zext_4i16_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_zext_4i16_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX512-NEXT: retq -entry: - %X = load <4 x i16>, <4 x i16>* %ptr - %Y = zext <4 x i16> %X to <4 x i64> - ret <4 x i64> %Y -} - -define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) { -; SSE2-LABEL: load_zext_8i16_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_8i16_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa (%rdi), %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_8i16_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_zext_8i16_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq 
-; -; AVX2-LABEL: load_zext_8i16_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_zext_8i16_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512-NEXT: retq -entry: - %X = load <8 x i16>, <8 x i16>* %ptr - %Y = zext <8 x i16> %X to <8 x i32> - ret <8 x i32> %Y -} - -define <2 x i64> @load_zext_2i32_to_2i64(<2 x i32> *%ptr) { -; SSE2-LABEL: load_zext_2i32_to_2i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_2i32_to_2i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_2i32_to_2i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; SSE41-NEXT: retq -; -; AVX-LABEL: load_zext_2i32_to_2i64: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; AVX-NEXT: retq -entry: - %X = load <2 x i32>, <2 x i32>* %ptr - %Y = zext <2 x i32> %X to <2 x i64> - ret <2 x i64> %Y -} - -define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) { -; SSE2-LABEL: load_zext_4i32_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movaps (%rdi), %xmm1 -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movaps %xmm1, %xmm0 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: load_zext_4i32_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movaps (%rdi), %xmm1 -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: movaps %xmm1, %xmm0 -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: load_zext_4i32_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: load_zext_4i32_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_zext_4i32_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_zext_4i32_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX512-NEXT: retq -entry: - %X = load <4 x i32>, <4 x i32>* %ptr - %Y = zext <4 x i32> %X to <4 x i64> - ret <4 x i64> %Y -} - -define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) { -; SSE2-LABEL: zext_8i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; 
SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_8i8_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_8i8_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_8i8_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_8i8_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_8i8_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: retq -entry: - %t = zext <8 x i8> %z to <8 x i32> - ret <8 x i32> %t -} - -define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_8i16_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_8i16_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_8i16_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; 
SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_8i16_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuf_zext_8i16_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: shuf_zext_8i16_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> - %Z = bitcast <16 x i16> %B to <8 x i32> - ret <8 x i32> %Z -} - -define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_4i32_to_4i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_4i32_to_4i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movaps %xmm0, %xmm1 -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_4i32_to_4i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_4i32_to_4i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuf_zext_4i32_to_4i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: shuf_zext_4i32_to_4i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> - %Z = bitcast <8 x i32> %B to <4 x i64> - ret <4 x i64> %Z -} - -define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) { -; SSE2-LABEL: shuf_zext_8i8_to_8i32: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_8i8_to_8i32: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; 
SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_8i8_to_8i32: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_8i8_to_8i32: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuf_zext_8i8_to_8i32: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: shuf_zext_8i8_to_8i32: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> - %Z = bitcast <32 x i8> %B to <8 x i32> - ret <8 x i32> %Z -} - -define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_16i8_to_2i64_offset6: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_16i8_to_2i64_offset6: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_16i8_to_2i64_offset6: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: psrlq $48, %xmm0 -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrlq 
$48, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrlq $48, %xmm0, %xmm0 -; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: shuf_zext_16i8_to_2i64_offset6: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: retq -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32> - %Z = bitcast <16 x i8> %B to <2 x i64> - ret <2 x i64> %Z -} - -define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_16i8_to_4i64_offset11: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[13],zero,zero,zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_16i8_to_4i64_offset11: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_16i8_to_4i64_offset11: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: shuf_zext_16i8_to_4i64_offset11: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> - %Z = bitcast <32 x i8> %B to <4 x i64> - ret <4 x i64> %Z -} - -define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_8i16_to_2i64_offset6: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: shuf_zext_8i16_to_2i64_offset6: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: retq -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> 
zeroinitializer, <8 x i32> - %Z = bitcast <8 x i16> %B to <2 x i64> - ret <2 x i64> %Z -} - -define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_8i16_to_4i64_offset2: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_8i16_to_4i64_offset2: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_8i16_to_4i64_offset2: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; SSE41-NEXT: movdqa %xmm2, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_8i16_to_4i64_offset2: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3] -; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> - %Z = bitcast <16 x i16> %B to <4 x i64> - ret <4 x i64> %Z -} - -define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_8i16_to_4i32_offset1: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_8i16_to_4i32_offset1: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: 
shuf_zext_8i16_to_4i32_offset1: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: retq -; -; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1: -; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1: -; AVX2-FAST: # %bb.0: # %entry -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero -; AVX2-FAST-NEXT: retq -; -; AVX512F-LABEL: shuf_zext_8i16_to_4i32_offset1: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: shuf_zext_8i16_to_4i32_offset1: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero -; AVX512BW-NEXT: retq -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> - %Z = bitcast <8 x i16> %B to <4 x i32> - ret <4 x i32> %Z -} - -define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_8i16_to_8i32_offset3: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_8i16_to_8i32_offset3: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_8i16_to_8i32_offset3: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_8i16_to_8i32_offset3: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: shuf_zext_8i16_to_8i32_offset3: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> - %Z = bitcast <16 x i16> %B to <8 x i32> - ret <8 x i32> %Z -} - -define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_16i16_to_8i32_offset8: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_16i16_to_8i32_offset8: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: shuf_zext_16i16_to_8i32_offset8: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> - %Z = bitcast <16 x i16> %B to <8 x i32> - ret <8 x i32> %Z -} - -define <2 x i64> @shuf_zext_4i32_to_2i64_offset2(<4 x i32> %A) nounwind uwtable readnone ssp { -; SSE-LABEL: 
shuf_zext_4i32_to_2i64_offset2: -; SSE: # %bb.0: # %entry -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: retq -; -; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2: -; AVX: # %bb.0: # %entry -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: retq -entry: - %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> - %Z = bitcast <4 x i32> %B to <2 x i64> - ret <2 x i64> %Z -} - -define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable readnone ssp { -; SSE2-LABEL: shuf_zext_4i32_to_4i64_offset1: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuf_zext_4i32_to_4i64_offset1: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0] -; SSSE3-NEXT: pand %xmm1, %xmm0 -; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuf_zext_4i32_to_4i64_offset1: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: pxor %xmm0, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7] -; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE41-NEXT: retq -; -; AVX1-LABEL: shuf_zext_4i32_to_4i64_offset1: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] -; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: shuf_zext_4i32_to_4i64_offset1: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3] -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: retq -entry: - %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> - %Z = bitcast <8 x i32> %B to <4 x i64> - ret <4 x i64> %Z -} - -define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) { -; SSE2-LABEL: zext_32i8_to_32i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm1, 112(%rdi) -; SSE2-NEXT: movdqa %xmm4, 96(%rdi) -; SSE2-NEXT: movdqa %xmm6, 80(%rdi) -; SSE2-NEXT: movdqa %xmm7, 64(%rdi) -; SSE2-NEXT: movdqa %xmm0, 48(%rdi) -; SSE2-NEXT: movdqa %xmm5, 32(%rdi) -; SSE2-NEXT: movdqa %xmm3, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_32i8_to_32i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: movdqa %xmm0, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm3, %xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] -; SSSE3-NEXT: movdqa %xmm0, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm6, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm4, 96(%rdi) -; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm7, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) -; SSSE3-NEXT: movdqa %xmm3, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm8, (%rdi) -; SSSE3-NEXT: retq -; -; 
SSE41-LABEL: zext_32i8_to_32i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: movdqa %xmm1, 112(%rdi) -; SSE41-NEXT: movdqa %xmm7, 96(%rdi) -; SSE41-NEXT: movdqa %xmm6, 80(%rdi) -; SSE41-NEXT: movdqa %xmm5, 64(%rdi) -; SSE41-NEXT: movdqa %xmm0, 48(%rdi) -; SSE41-NEXT: movdqa %xmm4, 32(%rdi) -; SSE41-NEXT: movdqa %xmm3, 16(%rdi) -; SSE41-NEXT: movdqa %xmm2, (%rdi) -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_32i8_to_32i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1] -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3 -; AVX1-NEXT: vmovaps %ymm4, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_32i8_to_32i32: -; AVX2: # %bb.0: -; 
AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vmovdqa %ymm4, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_32i8_to_32i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512-NEXT: retq - %res = zext <32 x i8>%x to <32 x i32> - ret <32 x i32> %res -} - -define <2 x i32> @zext_2i8_to_2i32(<2 x i8>* %addr) { -; SSE2-LABEL: zext_2i8_to_2i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movzwl (%rdi), %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: paddd %xmm0, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_2i8_to_2i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movzwl (%rdi), %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: paddd %xmm0, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_2i8_to_2i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movzwl (%rdi), %eax -; SSE41-NEXT: movd %eax, %xmm0 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; SSE41-NEXT: paddd %xmm0, %xmm0 -; SSE41-NEXT: retq -; -; AVX-LABEL: zext_2i8_to_2i32: -; AVX: # %bb.0: -; AVX-NEXT: movzwl 
(%rdi), %eax -; AVX-NEXT: vmovd %eax, %xmm0 -; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: retq - %x = load <2 x i8>, <2 x i8>* %addr, align 1 - %y = zext <2 x i8> %x to <2 x i32> - %z = add <2 x i32>%y, %y - ret <2 x i32>%z -} - -define <4 x i32> @zext_4i17_to_4i32(<4 x i17>* %ptr) { -; SSE2-LABEL: zext_4i17_to_4i32: -; SSE2: # %bb.0: -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: movq %rax, %rcx -; SSE2-NEXT: shrq $17, %rcx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movl 8(%rdi), %ecx -; SSE2-NEXT: shll $13, %ecx -; SSE2-NEXT: movq %rax, %rdx -; SSE2-NEXT: shrq $51, %rdx -; SSE2-NEXT: orl %ecx, %edx -; SSE2-NEXT: movd %edx, %xmm1 -; SSE2-NEXT: shrq $34, %rax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_4i17_to_4i32: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movq (%rdi), %rax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: movq %rax, %rcx -; SSSE3-NEXT: shrq $17, %rcx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movl 8(%rdi), %ecx -; SSSE3-NEXT: shll $13, %ecx -; SSSE3-NEXT: movq %rax, %rdx -; SSSE3-NEXT: shrq $51, %rdx -; SSSE3-NEXT: orl %ecx, %edx -; SSSE3-NEXT: movd %edx, %xmm1 -; SSSE3-NEXT: shrq $34, %rax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_4i17_to_4i32: -; SSE41: # %bb.0: -; SSE41-NEXT: movl 8(%rdi), %eax -; SSE41-NEXT: shll $13, %eax -; SSE41-NEXT: movq (%rdi), %rcx -; SSE41-NEXT: movq %rcx, %rdx -; SSE41-NEXT: shrq $51, %rdx -; SSE41-NEXT: orl %eax, %edx -; SSE41-NEXT: movq %rcx, %rax -; SSE41-NEXT: shrq $17, %rax -; SSE41-NEXT: movd %ecx, %xmm0 -; SSE41-NEXT: pinsrd $1, %eax, %xmm0 -; SSE41-NEXT: shrq $34, %rcx -; SSE41-NEXT: pinsrd $2, %ecx, %xmm0 -; SSE41-NEXT: pinsrd $3, %edx, %xmm0 -; SSE41-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_4i17_to_4i32: -; AVX1: # %bb.0: -; AVX1-NEXT: movl 8(%rdi), %eax -; AVX1-NEXT: shll $13, %eax -; AVX1-NEXT: movq (%rdi), %rcx -; AVX1-NEXT: movq %rcx, %rdx -; AVX1-NEXT: shrq $51, %rdx -; AVX1-NEXT: orl %eax, %edx -; AVX1-NEXT: movq %rcx, %rax -; AVX1-NEXT: shrq $17, %rax -; AVX1-NEXT: vmovd %ecx, %xmm0 -; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX1-NEXT: shrq $34, %rcx -; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_4i17_to_4i32: -; AVX2: # %bb.0: -; AVX2-NEXT: movl 8(%rdi), %eax -; AVX2-NEXT: shll $13, %eax -; AVX2-NEXT: movq (%rdi), %rcx -; AVX2-NEXT: movq %rcx, %rdx -; AVX2-NEXT: shrq $51, %rdx -; AVX2-NEXT: orl %eax, %edx -; AVX2-NEXT: movq %rcx, %rax -; AVX2-NEXT: shrq $17, %rax -; AVX2-NEXT: vmovd %ecx, %xmm0 -; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX2-NEXT: shrq $34, %rcx -; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071] -; AVX2-NEXT: vpand %xmm1, %xmm0, 
%xmm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_4i17_to_4i32: -; AVX512: # %bb.0: -; AVX512-NEXT: movl 8(%rdi), %eax -; AVX512-NEXT: shll $13, %eax -; AVX512-NEXT: movq (%rdi), %rcx -; AVX512-NEXT: movq %rcx, %rdx -; AVX512-NEXT: shrq $51, %rdx -; AVX512-NEXT: orl %eax, %edx -; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: shrq $17, %rax -; AVX512-NEXT: vmovd %ecx, %xmm0 -; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 -; AVX512-NEXT: shrq $34, %rcx -; AVX512-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0 -; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071] -; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: retq - %a = load <4 x i17>, <4 x i17>* %ptr - %b = zext <4 x i17> %a to <4 x i32> - ret <4 x i32> %b -} - -define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { -; SSE2-LABEL: zext_8i6_to_8i64: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movd %edi, %xmm0 -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE2-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,63] -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] -; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: zext_8i6_to_8i64: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movd %edi, %xmm0 -; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [63,63] -; SSSE3-NEXT: pand %xmm4, %xmm0 -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: pand %xmm4, %xmm1 -; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: pand %xmm4, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3] -; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7] -; SSSE3-NEXT: pand %xmm4, %xmm3 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: zext_8i6_to_8i64: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: movd %edi, %xmm0 -; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE41-NEXT: paddw {{.*}}(%rip), %xmm3 -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [63,63] -; SSE41-NEXT: pand %xmm4, %xmm0 -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: pand %xmm4, %xmm1 -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; SSE41-NEXT: pand %xmm4, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero -; SSE41-NEXT: 
pand %xmm4, %xmm3 -; SSE41-NEXT: retq -; -; AVX1-LABEL: zext_8i6_to_8i64: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovd %edi, %xmm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1] -; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: retq -; -; AVX2-LABEL: zext_8i6_to_8i64: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovd %edi, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: zext_8i6_to_8i64: -; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovd %edi, %xmm0 -; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512-NEXT: retq -entry: - %a = trunc i32 %x to i6 - %b = insertelement <8 x i6> undef, i6 %a, i32 0 - %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer - %d = add <8 x i6> %c, - %e = zext <8 x i6> %d to <8 x i64> - ret <8 x i64> %e -} - -define <4 x i64> @splatshuf_zext_v4i64(<4 x i32> %x) { -; SSE2-LABEL: splatshuf_zext_v4i64: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: splatshuf_zext_v4i64: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: splatshuf_zext_v4i64: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatshuf_zext_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatshuf_zext_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: retq -; -; 
AVX512-LABEL: splatshuf_zext_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0 -; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512-NEXT: retq - %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer - %ext = zext <4 x i32> %shuf to <4 x i64> - ret <4 x i64> %ext -} - -define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) { -; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7] -; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[u,u],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,8,9,10,11,12,13,14,15] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatshuf_zext_v8i32_matching_undefs: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: splatshuf_zext_v8i32_matching_undefs: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15] -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: retq - %shuf = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> - %ext = zext <8 x i16> %shuf to <8 x i32> - ret <8 x i32> %ext -} - -define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) { -; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef: -; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,14,15,6,7,12,13,14,15] -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15] -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatshuf_zext_v8i32_unmatched_undef: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15] -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: splatshuf_zext_v8i32_unmatched_undef: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15] -; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: retq - %shuf = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> - %ext = zext <8 x i16> %shuf to <8 x i32> - ret <8 x i32> %ext -} - -define <16 x i16> @splatshuf_zext_v16i16(<16 x i8> %x) { -; SSE2-LABEL: splatshuf_zext_v16i16: -; SSE2: # %bb.0: -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: splatshuf_zext_v16i16: -; SSSE3: # %bb.0: -; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: splatshuf_zext_v16i16: -; SSE41: # %bb.0: -; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: splatshuf_zext_v16i16: -; AVX1: # %bb.0: -; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: splatshuf_zext_v16i16: -; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] -; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX2-NEXT: retq -; -; AVX512-LABEL: splatshuf_zext_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14] -; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512-NEXT: retq - %shuf = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> - %ext = zext <16 x i8> %shuf to <16 x 
i16>
- ret <16 x i16> %ext
-}
diff --git a/test/CodeGen/X86/widen_cast-4.ll b/test/CodeGen/X86/widen_cast-4.ll
index 949b6dd9db0..f317d4b5913 100644
--- a/test/CodeGen/X86/widen_cast-4.ll
+++ b/test/CodeGen/X86/widen_cast-4.ll
@@ -1,45 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=NARROW
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=WIDE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=WIDE
 ; FIXME: We shouldn't require both a movd and an insert in the wide version.
 define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
-; NARROW-LABEL: update:
-; NARROW: # %bb.0: # %entry
-; NARROW-NEXT: subl $12, %esp
-; NARROW-NEXT: movl $0, (%esp)
-; NARROW-NEXT: pcmpeqd %xmm0, %xmm0
-; NARROW-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; NARROW-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; NARROW-NEXT: .p2align 4, 0x90
-; NARROW-NEXT: .LBB0_1: # %forcond
-; NARROW-NEXT: # =>This Inner Loop Header: Depth=1
-; NARROW-NEXT: movl (%esp), %eax
-; NARROW-NEXT: cmpl {{[0-9]+}}(%esp), %eax
-; NARROW-NEXT: jge .LBB0_3
-; NARROW-NEXT: # %bb.2: # %forbody
-; NARROW-NEXT: # in Loop: Header=BB0_1 Depth=1
-; NARROW-NEXT: movl (%esp), %eax
-; NARROW-NEXT: leal (,%eax,8), %ecx
-; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; NARROW-NEXT: addl %ecx, %edx
-; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; NARROW-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; NARROW-NEXT: psubb %xmm0, %xmm3
-; NARROW-NEXT: psrlw $2, %xmm3
-; NARROW-NEXT: pand %xmm1, %xmm3
-; NARROW-NEXT: pxor %xmm2, %xmm3
-; NARROW-NEXT: psubb %xmm2, %xmm3
-; NARROW-NEXT: movq %xmm3, (%edx,%eax,8)
-; NARROW-NEXT: incl (%esp)
-; NARROW-NEXT: jmp .LBB0_1
-; NARROW-NEXT: .LBB0_3: # %afterfor
-; NARROW-NEXT: addl $12, %esp
-; NARROW-NEXT: retl
-;
 ; WIDE-LABEL: update:
 ; WIDE: # %bb.0: # %entry
 ; WIDE-NEXT: subl $12, %esp
diff --git a/test/CodeGen/X86/widen_conversions.ll b/test/CodeGen/X86/widen_conversions.ll
index acd8c78fa2d..d9d43bb4ad2 100644
--- a/test/CodeGen/X86/widen_conversions.ll
+++ b/test/CodeGen/X86/widen_conversions.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
 define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
 ; X86-LABEL: zext_v4i8_to_v4i32:
diff --git a/test/CodeGen/X86/widen_mul.ll b/test/CodeGen/X86/widen_mul.ll
index b3ccb961c55..783ca948b0a 100644
--- a/test/CodeGen/X86/widen_mul.ll
+++ b/test/CodeGen/X86/widen_mul.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
 ; Test multiplies of various narrow types.
--
2.40.0