; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s
; Check that we perform a scalar XOR on i32.
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQNOVL --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=avx512vl,avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
-
-
-define <16 x float> @sitof32(<16 x i32> %a) nounwind {
-; ALL-LABEL: sitof32:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = sitofp <16 x i32> %a to <16 x float>
- ret <16 x float> %b
-}
-
-define <8 x double> @sltof864(<8 x i64> %a) {
-; NODQ-LABEL: sltof864:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: sltof864:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: sltof864:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT: retq
- %b = sitofp <8 x i64> %a to <8 x double>
- ret <8 x double> %b
-}
-
-define <4 x double> @slto4f64(<4 x i64> %a) {
-; NODQ-LABEL: slto4f64:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: slto4f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: slto4f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT: retq
- %b = sitofp <4 x i64> %a to <4 x double>
- ret <4 x double> %b
-}
-
-define <2 x double> @slto2f64(<2 x i64> %a) {
-; NODQ-LABEL: slto2f64:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: slto2f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: slto2f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %b = sitofp <2 x i64> %a to <2 x double>
- ret <2 x double> %b
-}
-
-define <2 x float> @sltof2f32(<2 x i64> %a) {
-; NOVLDQ-LABEL: sltof2f32:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: vpextrq $1, %xmm0, %rax
-; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; NOVLDQ-NEXT: vmovq %xmm0, %rax
-; NOVLDQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; NOVLDQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: sltof2f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: sltof2f32:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vpextrq $1, %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VLNODQ-NEXT: vmovq %xmm0, %rax
-; VLNODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VLNODQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; VLNODQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: sltof2f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %b = sitofp <2 x i64> %a to <2 x float>
- ret <2 x float>%b
-}
-
-define <4 x float> @slto4f32_mem(<4 x i64>* %a) {
-; NODQ-LABEL: slto4f32_mem:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vmovdqu (%rdi), %xmm0
-; NODQ-NEXT: vmovdqu 16(%rdi), %xmm1
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: slto4f32_mem:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: slto4f32_mem:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vmovups (%rdi), %ymm0
-; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %a1 = load <4 x i64>, <4 x i64>* %a, align 8
- %b = sitofp <4 x i64> %a1 to <4 x float>
- ret <4 x float>%b
-}
-
-define <4 x i64> @f64to4sl(<4 x double> %a) {
-; NODQ-LABEL: f64to4sl:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextractf128 $1, %ymm0, %xmm1
-; NODQ-NEXT: vcvttsd2si %xmm1, %rax
-; NODQ-NEXT: vmovq %rax, %xmm2
-; NODQ-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; NODQ-NEXT: vcvttsd2si %xmm1, %rax
-; NODQ-NEXT: vmovq %rax, %xmm1
-; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; NODQ-NEXT: vcvttsd2si %xmm0, %rax
-; NODQ-NEXT: vmovq %rax, %xmm2
-; NODQ-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; NODQ-NEXT: vcvttsd2si %xmm0, %rax
-; NODQ-NEXT: vmovq %rax, %xmm0
-; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: f64to4sl:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: f64to4sl:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; DQNOVL-NEXT: vcvttpd2qq %zmm0, %zmm0
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT: retq
- %b = fptosi <4 x double> %a to <4 x i64>
- ret <4 x i64> %b
-}
-
-define <4 x i64> @f32to4sl(<4 x float> %a) {
-; NODQ-LABEL: f32to4sl:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; NODQ-NEXT: vcvttss2si %xmm1, %rax
-; NODQ-NEXT: vmovq %rax, %xmm1
-; NODQ-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; NODQ-NEXT: vcvttss2si %xmm2, %rax
-; NODQ-NEXT: vmovq %rax, %xmm2
-; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; NODQ-NEXT: vcvttss2si %xmm0, %rax
-; NODQ-NEXT: vmovq %rax, %xmm2
-; NODQ-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; NODQ-NEXT: vcvttss2si %xmm0, %rax
-; NODQ-NEXT: vmovq %rax, %xmm0
-; NODQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; NODQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: f32to4sl:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: f32to4sl:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; DQNOVL-NEXT: vcvttps2qq %ymm0, %zmm0
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT: retq
- %b = fptosi <4 x float> %a to <4 x i64>
- ret <4 x i64> %b
-}
-
-define <4 x float> @slto4f32(<4 x i64> %a) {
-; NODQ-LABEL: slto4f32:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT: vzeroupper
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: slto4f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
-; VLDQ-NEXT: vzeroupper
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: slto4f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %b = sitofp <4 x i64> %a to <4 x float>
- ret <4 x float> %b
-}
-
-define <4 x float> @ulto4f32(<4 x i64> %a) {
-; NODQ-LABEL: ulto4f32:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT: vzeroupper
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: ulto4f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
-; VLDQ-NEXT: vzeroupper
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: ulto4f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %b = uitofp <4 x i64> %a to <4 x float>
- ret <4 x float> %b
-}
-
-define <8 x double> @ulto8f64(<8 x i64> %a) {
-; NODQ-LABEL: ulto8f64:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1
-; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1
-; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0
-; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; NODQ-NEXT: vaddpd %zmm0, %zmm1, %zmm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: ulto8f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: ulto8f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT: retq
- %b = uitofp <8 x i64> %a to <8 x double>
- ret <8 x double> %b
-}
-
-define <16 x double> @ulto16f64(<16 x i64> %a) {
-; NODQ-LABEL: ulto16f64:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295]
-; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3
-; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3
-; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0
-; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0
-; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0
-; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0
-; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2
-; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2
-; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1
-; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1
-; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1
-; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: ulto16f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; VLDQ-NEXT: vcvtuqq2pd %zmm1, %zmm1
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: ulto16f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT: vcvtuqq2pd %zmm1, %zmm1
-; DQNOVL-NEXT: retq
- %b = uitofp <16 x i64> %a to <16 x double>
- ret <16 x double> %b
-}
-
-define <16 x i32> @f64to16si(<16 x float> %a) nounwind {
-; ALL-LABEL: f64to16si:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = fptosi <16 x float> %a to <16 x i32>
- ret <16 x i32> %b
-}
-
-define <16 x i8> @f32to16sc(<16 x float> %f) {
-; ALL-LABEL: f32to16sc:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT: vpmovdb %zmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %res = fptosi <16 x float> %f to <16 x i8>
- ret <16 x i8> %res
-}
-
-define <16 x i16> @f32to16ss(<16 x float> %f) {
-; ALL-LABEL: f32to16ss:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
-; ALL-NEXT: retq
- %res = fptosi <16 x float> %f to <16 x i16>
- ret <16 x i16> %res
-}
-
-define <16 x i32> @f32to16ui(<16 x float> %a) nounwind {
-; ALL-LABEL: f32to16ui:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttps2udq %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = fptoui <16 x float> %a to <16 x i32>
- ret <16 x i32> %b
-}
-
-define <16 x i8> @f32to16uc(<16 x float> %f) {
-; ALL-LABEL: f32to16uc:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT: vpmovdb %zmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %res = fptoui <16 x float> %f to <16 x i8>
- ret <16 x i8> %res
-}
-
-define <16 x i16> @f32to16us(<16 x float> %f) {
-; ALL-LABEL: f32to16us:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttps2dq %zmm0, %zmm0
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
-; ALL-NEXT: retq
- %res = fptoui <16 x float> %f to <16 x i16>
- ret <16 x i16> %res
-}
-
-define <8 x i32> @f32to8ui(<8 x float> %a) nounwind {
-; NOVL-LABEL: f32to8ui:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f32to8ui:
-; VL: # %bb.0:
-; VL-NEXT: vcvttps2udq %ymm0, %ymm0
-; VL-NEXT: retq
- %b = fptoui <8 x float> %a to <8 x i32>
- ret <8 x i32> %b
-}
-
-define <4 x i32> @f32to4ui(<4 x float> %a) nounwind {
-; NOVL-LABEL: f32to4ui:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVL-NEXT: vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f32to4ui:
-; VL: # %bb.0:
-; VL-NEXT: vcvttps2udq %xmm0, %xmm0
-; VL-NEXT: retq
- %b = fptoui <4 x float> %a to <4 x i32>
- ret <4 x i32> %b
-}
-
-define <8 x i32> @f64to8ui(<8 x double> %a) nounwind {
-; ALL-LABEL: f64to8ui:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; ALL-NEXT: retq
- %b = fptoui <8 x double> %a to <8 x i32>
- ret <8 x i32> %b
-}
-
-define <8 x i16> @f64to8us(<8 x double> %f) {
-; NOVL-LABEL: f64to8us:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpmovdw %zmm0, %ymm0
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f64to8us:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpmovdw %ymm0, %xmm0
-; VL-NEXT: vzeroupper
-; VL-NEXT: retq
- %res = fptoui <8 x double> %f to <8 x i16>
- ret <8 x i16> %res
-}
-
-define <8 x i8> @f64to8uc(<8 x double> %f) {
-; NOVL-LABEL: f64to8uc:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpmovdb %zmm0, %xmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f64to8uc:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpmovdb %ymm0, %xmm0
-; VL-NEXT: vzeroupper
-; VL-NEXT: retq
- %res = fptoui <8 x double> %f to <8 x i8>
- ret <8 x i8> %res
-}
-
-define <4 x i32> @f64to4ui(<4 x double> %a) nounwind {
-; NOVL-LABEL: f64to4ui:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; NOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f64to4ui:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2udq %ymm0, %xmm0
-; VL-NEXT: vzeroupper
-; VL-NEXT: retq
- %b = fptoui <4 x double> %a to <4 x i32>
- ret <4 x i32> %b
-}
-
-define <8 x double> @sito8f64(<8 x i32> %a) {
-; ALL-LABEL: sito8f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %b = sitofp <8 x i32> %a to <8 x double>
- ret <8 x double> %b
-}
-define <8 x double> @i32to8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: i32to8f64_mask:
-; KNL: # %bb.0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; KNL-NEXT: retq
-;
-; VLBW-LABEL: i32to8f64_mask:
-; VLBW: # %bb.0:
-; VLBW-NEXT: kmovd %edi, %k1
-; VLBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; VLBW-NEXT: retq
-;
-; VLNOBW-LABEL: i32to8f64_mask:
-; VLNOBW: # %bb.0:
-; VLNOBW-NEXT: kmovw %edi, %k1
-; VLNOBW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; VLNOBW-NEXT: retq
-;
-; DQNOVL-LABEL: i32to8f64_mask:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: kmovw %edi, %k1
-; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; DQNOVL-NEXT: retq
-;
-; AVX512BW-LABEL: i32to8f64_mask:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vcvtdq2pd %ymm1, %zmm0 {%k1}
-; AVX512BW-NEXT: retq
- %1 = bitcast i8 %c to <8 x i1>
- %2 = sitofp <8 x i32> %b to <8 x double>
- %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
- ret <8 x double> %3
-}
-define <8 x double> @sito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: sito8f64_maskz:
-; KNL: # %bb.0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
-;
-; VLBW-LABEL: sito8f64_maskz:
-; VLBW: # %bb.0:
-; VLBW-NEXT: kmovd %edi, %k1
-; VLBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; VLBW-NEXT: retq
-;
-; VLNOBW-LABEL: sito8f64_maskz:
-; VLNOBW: # %bb.0:
-; VLNOBW-NEXT: kmovw %edi, %k1
-; VLNOBW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; VLNOBW-NEXT: retq
-;
-; DQNOVL-LABEL: sito8f64_maskz:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: kmovw %edi, %k1
-; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: retq
-;
-; AVX512BW-LABEL: sito8f64_maskz:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: retq
- %1 = bitcast i8 %b to <8 x i1>
- %2 = sitofp <8 x i32> %a to <8 x double>
- %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <8 x i32> @f64to8si(<8 x double> %a) {
-; ALL-LABEL: f64to8si:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; ALL-NEXT: retq
- %b = fptosi <8 x double> %a to <8 x i32>
- ret <8 x i32> %b
-}
-
-define <8 x i16> @f64to8ss(<8 x double> %f) {
-; NOVL-LABEL: f64to8ss:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpmovdw %zmm0, %ymm0
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f64to8ss:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpmovdw %ymm0, %xmm0
-; VL-NEXT: vzeroupper
-; VL-NEXT: retq
- %res = fptosi <8 x double> %f to <8 x i16>
- ret <8 x i16> %res
-}
-
-define <8 x i8> @f64to8sc(<8 x double> %f) {
-; NOVL-LABEL: f64to8sc:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVL-NEXT: vpmovdb %zmm0, %xmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f64to8sc:
-; VL: # %bb.0:
-; VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VL-NEXT: vpmovdb %ymm0, %xmm0
-; VL-NEXT: vzeroupper
-; VL-NEXT: retq
- %res = fptosi <8 x double> %f to <8 x i8>
- ret <8 x i8> %res
-}
-
-define <4 x i32> @f64to4si(<4 x double> %a) {
-; ALL-LABEL: f64to4si:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %b = fptosi <4 x double> %a to <4 x i32>
- ret <4 x i32> %b
-}
-
-define <16 x float> @f64to16f32(<16 x double> %b) nounwind {
-; ALL-LABEL: f64to16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0
-; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1
-; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; ALL-NEXT: retq
- %a = fptrunc <16 x double> %b to <16 x float>
- ret <16 x float> %a
-}
-
-define <4 x float> @f64to4f32(<4 x double> %b) {
-; ALL-LABEL: f64to4f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %a = fptrunc <4 x double> %b to <4 x float>
- ret <4 x float> %a
-}
-
-define <4 x float> @f64to4f32_mask(<4 x double> %b, <4 x i1> %mask) {
-; NOVLDQ-LABEL: f64to4f32_mask:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: vpslld $31, %xmm1, %xmm1
-; NOVLDQ-NEXT: vptestmd %zmm1, %zmm1, %k1
-; NOVLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0
-; NOVLDQ-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT: vzeroupper
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: f64to4f32_mask:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpslld $31, %xmm1, %xmm1
-; VLDQ-NEXT: vpmovd2m %xmm1, %k1
-; VLDQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
-; VLDQ-NEXT: vzeroupper
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: f64to4f32_mask:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vpslld $31, %xmm1, %xmm1
-; VLNODQ-NEXT: vptestmd %xmm1, %xmm1, %k1
-; VLNODQ-NEXT: vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
-; VLNODQ-NEXT: vzeroupper
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: f64to4f32_mask:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vpslld $31, %xmm1, %xmm1
-; DQNOVL-NEXT: vpmovd2m %zmm1, %k1
-; DQNOVL-NEXT: vcvtpd2ps %ymm0, %xmm0
-; DQNOVL-NEXT: vmovaps %zmm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %a = fptrunc <4 x double> %b to <4 x float>
- %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
- ret <4 x float> %c
-}
-
-define <4 x float> @f64tof32_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
-; ALL-LABEL: f64tof32_inreg:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtsd2ss %xmm0, %xmm1, %xmm0
-; ALL-NEXT: retq
- %ext = extractelement <2 x double> %a0, i32 0
- %cvt = fptrunc double %ext to float
- %res = insertelement <4 x float> %a1, float %cvt, i32 0
- ret <4 x float> %res
-}
-
-define <8 x double> @f32to8f64(<8 x float> %b) nounwind {
-; ALL-LABEL: f32to8f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtps2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %a = fpext <8 x float> %b to <8 x double>
- ret <8 x double> %a
-}
-
-define <4 x double> @f32to4f64_mask(<4 x float> %b, <4 x double> %b1, <4 x double> %a1) {
-; NOVL-LABEL: f32to4f64_mask:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2
-; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVL-NEXT: vcvtps2pd %xmm0, %ymm0
-; NOVL-NEXT: vcmpltpd %zmm2, %zmm1, %k1
-; NOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f32to4f64_mask:
-; VL: # %bb.0:
-; VL-NEXT: vcmpltpd %ymm2, %ymm1, %k1
-; VL-NEXT: vcvtps2pd %xmm0, %ymm0 {%k1} {z}
-; VL-NEXT: retq
- %a = fpext <4 x float> %b to <4 x double>
- %mask = fcmp ogt <4 x double> %a1, %b1
- %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
- ret <4 x double> %c
-}
-
-define <4 x double> @f32to4f64_mask_load(<4 x float>* %p, <4 x double> %b1, <4 x double> %a1) {
-; NOVL-LABEL: f32to4f64_mask_load:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; NOVL-NEXT: vcvtps2pd (%rdi), %ymm2
-; NOVL-NEXT: vcmpltpd %zmm1, %zmm0, %k1
-; NOVL-NEXT: vmovapd %zmm2, %zmm0 {%k1} {z}
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: f32to4f64_mask_load:
-; VL: # %bb.0:
-; VL-NEXT: vcmpltpd %ymm1, %ymm0, %k1
-; VL-NEXT: vcvtps2pd (%rdi), %ymm0 {%k1} {z}
-; VL-NEXT: retq
- %b = load <4 x float>, <4 x float>* %p
- %a = fpext <4 x float> %b to <4 x double>
- %mask = fcmp ogt <4 x double> %a1, %b1
- %c = select <4 x i1> %mask, <4 x double> %a, <4 x double> zeroinitializer
- ret <4 x double> %c
-}
-
-define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind {
-; ALL-LABEL: f32tof64_inreg:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtss2sd %xmm1, %xmm0, %xmm0
-; ALL-NEXT: retq
- %ext = extractelement <4 x float> %a1, i32 0
- %cvt = fpext float %ext to double
- %res = insertelement <2 x double> %a0, double %cvt, i32 0
- ret <2 x double> %res
-}
-
-define double @sltof64_load(i64* nocapture %e) {
-; ALL-LABEL: sltof64_load:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
-; ALL-NEXT: retq
-entry:
- %tmp1 = load i64, i64* %e, align 8
- %conv = sitofp i64 %tmp1 to double
- ret double %conv
-}
-
-define double @sitof64_load(i32* %e) {
-; ALL-LABEL: sitof64_load:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
-; ALL-NEXT: retq
-entry:
- %tmp1 = load i32, i32* %e, align 4
- %conv = sitofp i32 %tmp1 to double
- ret double %conv
-}
-
-define float @sitof32_load(i32* %e) {
-; ALL-LABEL: sitof32_load:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
-; ALL-NEXT: retq
-entry:
- %tmp1 = load i32, i32* %e, align 4
- %conv = sitofp i32 %tmp1 to float
- ret float %conv
-}
-
-define float @sltof32_load(i64* %e) {
-; ALL-LABEL: sltof32_load:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
-; ALL-NEXT: retq
-entry:
- %tmp1 = load i64, i64* %e, align 8
- %conv = sitofp i64 %tmp1 to float
- ret float %conv
-}
-
-define void @f32tof64_loadstore() {
-; ALL-LABEL: f32tof64_loadstore:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: retq
-entry:
- %f = alloca float, align 4
- %d = alloca double, align 8
- %tmp = load float, float* %f, align 4
- %conv = fpext float %tmp to double
- store double %conv, double* %d, align 8
- ret void
-}
-
-define void @f64tof32_loadstore() nounwind uwtable {
-; ALL-LABEL: f64tof32_loadstore:
-; ALL: # %bb.0: # %entry
-; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; ALL-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
-; ALL-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp)
-; ALL-NEXT: retq
-entry:
- %f = alloca float, align 4
- %d = alloca double, align 8
- %tmp = load double, double* %d, align 8
- %conv = fptrunc double %tmp to float
- store float %conv, float* %f, align 4
- ret void
-}
-
-define double @long_to_double(i64 %x) {
-; ALL-LABEL: long_to_double:
-; ALL: # %bb.0:
-; ALL-NEXT: vmovq %rdi, %xmm0
-; ALL-NEXT: retq
- %res = bitcast i64 %x to double
- ret double %res
-}
-
-define i64 @double_to_long(double %x) {
-; ALL-LABEL: double_to_long:
-; ALL: # %bb.0:
-; ALL-NEXT: vmovq %xmm0, %rax
-; ALL-NEXT: retq
- %res = bitcast double %x to i64
- ret i64 %res
-}
-
-define float @int_to_float(i32 %x) {
-; ALL-LABEL: int_to_float:
-; ALL: # %bb.0:
-; ALL-NEXT: vmovd %edi, %xmm0
-; ALL-NEXT: retq
- %res = bitcast i32 %x to float
- ret float %res
-}
-
-define i32 @float_to_int(float %x) {
-; ALL-LABEL: float_to_int:
-; ALL: # %bb.0:
-; ALL-NEXT: vmovd %xmm0, %eax
-; ALL-NEXT: retq
- %res = bitcast float %x to i32
- ret i32 %res
-}
-
-define <16 x double> @uito16f64(<16 x i32> %a) nounwind {
-; ALL-LABEL: uito16f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1
-; ALL-NEXT: vmovaps %zmm2, %zmm0
-; ALL-NEXT: retq
- %b = uitofp <16 x i32> %a to <16 x double>
- ret <16 x double> %b
-}
-
-define <8 x float> @slto8f32(<8 x i64> %a) {
-; NODQ-LABEL: slto8f32:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: slto8f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: slto8f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT: retq
- %b = sitofp <8 x i64> %a to <8 x float>
- ret <8 x float> %b
-}
-
-define <16 x float> @slto16f32(<16 x i64> %a) {
-; NODQ-LABEL: slto16f32:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: slto16f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; VLDQ-NEXT: vcvtqq2ps %zmm1, %ymm1
-; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: slto16f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT: vcvtqq2ps %zmm1, %ymm1
-; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; DQNOVL-NEXT: retq
- %b = sitofp <16 x i64> %a to <16 x float>
- ret <16 x float> %b
-}
-
-define <8 x double> @slto8f64(<8 x i64> %a) {
-; NODQ-LABEL: slto8f64:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: slto8f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: slto8f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT: retq
- %b = sitofp <8 x i64> %a to <8 x double>
- ret <8 x double> %b
-}
-
-define <16 x double> @slto16f64(<16 x i64> %a) {
-; NODQ-LABEL: slto16f64:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
-; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1
-; NODQ-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: slto16f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; VLDQ-NEXT: vcvtqq2pd %zmm1, %zmm1
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: slto16f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtqq2pd %zmm0, %zmm0
-; DQNOVL-NEXT: vcvtqq2pd %zmm1, %zmm1
-; DQNOVL-NEXT: retq
- %b = sitofp <16 x i64> %a to <16 x double>
- ret <16 x double> %b
-}
-
-define <8 x float> @ulto8f32(<8 x i64> %a) {
-; NODQ-LABEL: ulto8f32:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: ulto8f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: ulto8f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT: retq
- %b = uitofp <8 x i64> %a to <8 x float>
- ret <8 x float> %b
-}
-
-define <16 x float> @ulto16f32(<16 x i64> %a) {
-; NODQ-LABEL: ulto16f32:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1
-; NODQ-NEXT: vmovq %xmm1, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; NODQ-NEXT: vpextrq $1, %xmm1, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1
-; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; NODQ-NEXT: vpextrq $1, %xmm2, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovq %xmm2, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
-; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm3
-; NODQ-NEXT: vmovq %xmm3, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; NODQ-NEXT: vpextrq $1, %xmm3, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
-; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT: vmovq %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
-; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3]
-; NODQ-NEXT: vpextrq $1, %xmm0, %rax
-; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0
-; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0]
-; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: ulto16f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; VLDQ-NEXT: vcvtuqq2ps %zmm1, %ymm1
-; VLDQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: ulto16f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; DQNOVL-NEXT: vcvtuqq2ps %zmm1, %ymm1
-; DQNOVL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
-; DQNOVL-NEXT: retq
- %b = uitofp <16 x i64> %a to <16 x float>
- ret <16 x float> %b
-}
-
-define <8 x double> @uito8f64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; KNL-LABEL: uito8f64_mask:
-; KNL: # %bb.0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; KNL-NEXT: retq
-;
-; VLBW-LABEL: uito8f64_mask:
-; VLBW: # %bb.0:
-; VLBW-NEXT: kmovd %edi, %k1
-; VLBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; VLBW-NEXT: retq
-;
-; VLNOBW-LABEL: uito8f64_mask:
-; VLNOBW: # %bb.0:
-; VLNOBW-NEXT: kmovw %edi, %k1
-; VLNOBW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; VLNOBW-NEXT: retq
-;
-; DQNOVL-LABEL: uito8f64_mask:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: kmovw %edi, %k1
-; DQNOVL-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; DQNOVL-NEXT: retq
-;
-; AVX512BW-LABEL: uito8f64_mask:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vcvtudq2pd %ymm1, %zmm0 {%k1}
-; AVX512BW-NEXT: retq
- %1 = bitcast i8 %c to <8 x i1>
- %2 = uitofp <8 x i32> %b to <8 x double>
- %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
- ret <8 x double> %3
-}
-define <8 x double> @uito8f64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; KNL-LABEL: uito8f64_maskz:
-; KNL: # %bb.0:
-; KNL-NEXT: kmovw %edi, %k1
-; KNL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; KNL-NEXT: retq
-;
-; VLBW-LABEL: uito8f64_maskz:
-; VLBW: # %bb.0:
-; VLBW-NEXT: kmovd %edi, %k1
-; VLBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; VLBW-NEXT: retq
-;
-; VLNOBW-LABEL: uito8f64_maskz:
-; VLNOBW: # %bb.0:
-; VLNOBW-NEXT: kmovw %edi, %k1
-; VLNOBW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; VLNOBW-NEXT: retq
-;
-; DQNOVL-LABEL: uito8f64_maskz:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: kmovw %edi, %k1
-; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: retq
-;
-; AVX512BW-LABEL: uito8f64_maskz:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: retq
- %1 = bitcast i8 %b to <8 x i1>
- %2 = uitofp <8 x i32> %a to <8 x double>
- %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
- ret <8 x double> %3
-}
-
-define <4 x double> @uito4f64(<4 x i32> %a) nounwind {
-; NOVL-LABEL: uito4f64:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; NOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: uito4f64:
-; VL: # %bb.0:
-; VL-NEXT: vcvtudq2pd %xmm0, %ymm0
-; VL-NEXT: retq
- %b = uitofp <4 x i32> %a to <4 x double>
- ret <4 x double> %b
-}
-
-define <16 x float> @uito16f32(<16 x i32> %a) nounwind {
-; ALL-LABEL: uito16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = uitofp <16 x i32> %a to <16 x float>
- ret <16 x float> %b
-}
-
-define <8 x double> @uito8f64(<8 x i32> %a) {
-; ALL-LABEL: uito8f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %b = uitofp <8 x i32> %a to <8 x double>
- ret <8 x double> %b
-}
-
-define <8 x float> @uito8f32(<8 x i32> %a) nounwind {
-; NOVL-LABEL: uito8f32:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: uito8f32:
-; VL: # %bb.0:
-; VL-NEXT: vcvtudq2ps %ymm0, %ymm0
-; VL-NEXT: retq
- %b = uitofp <8 x i32> %a to <8 x float>
- ret <8 x float> %b
-}
-
-define <4 x float> @uito4f32(<4 x i32> %a) nounwind {
-; NOVL-LABEL: uito4f32:
-; NOVL: # %bb.0:
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVL-NEXT: vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVL-NEXT: vzeroupper
-; NOVL-NEXT: retq
-;
-; VL-LABEL: uito4f32:
-; VL: # %bb.0:
-; VL-NEXT: vcvtudq2ps %xmm0, %xmm0
-; VL-NEXT: retq
- %b = uitofp <4 x i32> %a to <4 x float>
- ret <4 x float> %b
-}
-
-define i32 @fptosi(float %a) nounwind {
-; ALL-LABEL: fptosi:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttss2si %xmm0, %eax
-; ALL-NEXT: retq
- %b = fptosi float %a to i32
- ret i32 %b
-}
-
-define i32 @fptoui(float %a) nounwind {
-; ALL-LABEL: fptoui:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvttss2usi %xmm0, %eax
-; ALL-NEXT: retq
- %b = fptoui float %a to i32
- ret i32 %b
-}
-
-define float @uitof32(i32 %a) nounwind {
-; ALL-LABEL: uitof32:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0
-; ALL-NEXT: retq
- %b = uitofp i32 %a to float
- ret float %b
-}
-
-define double @uitof64(i32 %a) nounwind {
-; ALL-LABEL: uitof64:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0
-; ALL-NEXT: retq
- %b = uitofp i32 %a to double
- ret double %b
-}
-
-define <16 x float> @sbto16f32(<16 x i32> %a) {
-; NODQ-LABEL: sbto16f32:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: sbto16f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; VLDQ-NEXT: vpmovm2d %k0, %zmm0
-; VLDQ-NEXT: vcvtdq2ps %zmm0, %zmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: sbto16f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k0
-; DQNOVL-NEXT: vpmovm2d %k0, %zmm0
-; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; DQNOVL-NEXT: retq
- %mask = icmp slt <16 x i32> %a, zeroinitializer
- %1 = sitofp <16 x i1> %mask to <16 x float>
- ret <16 x float> %1
-}
-
-define <16 x float> @scto16f32(<16 x i8> %a) {
-; ALL-LABEL: scto16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd %xmm0, %zmm0
-; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %1 = sitofp <16 x i8> %a to <16 x float>
- ret <16 x float> %1
-}
-
-define <16 x float> @ssto16f32(<16 x i16> %a) {
-; ALL-LABEL: ssto16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
-; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %1 = sitofp <16 x i16> %a to <16 x float>
- ret <16 x float> %1
-}
-
-define <8 x double> @ssto16f64(<8 x i16> %a) {
-; ALL-LABEL: ssto16f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %1 = sitofp <8 x i16> %a to <8 x double>
- ret <8 x double> %1
-}
-
-define <8 x double> @scto8f64(<8 x i8> %a) {
-; ALL-LABEL: scto8f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd %xmm0, %ymm0
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %1 = sitofp <8 x i8> %a to <8 x double>
- ret <8 x double> %1
-}
-
-define <16 x double> @scto16f64(<16 x i8> %a) {
-; ALL-LABEL: scto16f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxbd %xmm0, %zmm1
-; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
-; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
-; ALL-NEXT: retq
- %b = sitofp <16 x i8> %a to <16 x double>
- ret <16 x double> %b
-}
-
-define <16 x double> @sbto16f64(<16 x double> %a) {
-; NODQ-LABEL: sbto16f64:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; NODQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0
-; NODQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
-; NODQ-NEXT: kunpckbw %k0, %k1, %k1
-; NODQ-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
-; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0
-; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: sbto16f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; VLDQ-NEXT: vcmpltpd %zmm0, %zmm2, %k0
-; VLDQ-NEXT: vcmpltpd %zmm1, %zmm2, %k1
-; VLDQ-NEXT: kunpckbw %k0, %k1, %k0
-; VLDQ-NEXT: vpmovm2d %k0, %zmm1
-; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0
-; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: sbto16f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm2, %k0
-; DQNOVL-NEXT: vcmpltpd %zmm1, %zmm2, %k1
-; DQNOVL-NEXT: kunpckbw %k0, %k1, %k0
-; DQNOVL-NEXT: vpmovm2d %k0, %zmm1
-; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0
-; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1
-; DQNOVL-NEXT: retq
- %cmpres = fcmp ogt <16 x double> %a, zeroinitializer
- %1 = sitofp <16 x i1> %cmpres to <16 x double>
- ret <16 x double> %1
-}
-
-define <8 x double> @sbto8f64(<8 x double> %a) {
-; NOVLDQ-LABEL: sbto8f64:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; NOVLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: sbto8f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; VLDQ-NEXT: vcmpltpd %zmm0, %zmm1, %k0
-; VLDQ-NEXT: vpmovm2d %k0, %ymm0
-; VLDQ-NEXT: vcvtdq2pd %ymm0, %zmm0
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: sbto8f64:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT: vcmpltpd %zmm0, %zmm1, %k1
-; VLNODQ-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; VLNODQ-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z}
-; VLNODQ-NEXT: vcvtdq2pd %ymm0, %zmm0
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: sbto8f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; DQNOVL-NEXT: vcmpltpd %zmm0, %zmm1, %k0
-; DQNOVL-NEXT: vpmovm2d %k0, %zmm0
-; DQNOVL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; DQNOVL-NEXT: retq
- %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
- %1 = sitofp <8 x i1> %cmpres to <8 x double>
- ret <8 x double> %1
-}
-
-define <8 x float> @sbto8f32(<8 x float> %a) {
-; ALL-LABEL: sbto8f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
-; ALL-NEXT: vcvtdq2ps %ymm0, %ymm0
-; ALL-NEXT: retq
- %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
- %1 = sitofp <8 x i1> %cmpres to <8 x float>
- ret <8 x float> %1
-}
-
-define <4 x float> @sbto4f32(<4 x float> %a) {
-; ALL-LABEL: sbto4f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0
-; ALL-NEXT: retq
- %cmpres = fcmp ogt <4 x float> %a, zeroinitializer
- %1 = sitofp <4 x i1> %cmpres to <4 x float>
- ret <4 x float> %1
-}
-
-define <4 x double> @sbto4f64(<4 x double> %a) {
-; NOVL-LABEL: sbto4f64:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vcmpltpd %ymm0, %ymm1, %ymm0
-; NOVL-NEXT: vpmovqd %zmm0, %ymm0
-; NOVL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; NOVL-NEXT: retq
-;
-; VLDQ-LABEL: sbto4f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; VLDQ-NEXT: vcmpltpd %ymm0, %ymm1, %k0
-; VLDQ-NEXT: vpmovm2d %k0, %xmm0
-; VLDQ-NEXT: vcvtdq2pd %xmm0, %ymm0
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: sbto4f64:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT: vcmpltpd %ymm0, %ymm1, %k1
-; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; VLNODQ-NEXT: vcvtdq2pd %xmm0, %ymm0
-; VLNODQ-NEXT: retq
- %cmpres = fcmp ogt <4 x double> %a, zeroinitializer
- %1 = sitofp <4 x i1> %cmpres to <4 x double>
- ret <4 x double> %1
-}
-
-define <2 x float> @sbto2f32(<2 x float> %a) {
-; ALL-LABEL: sbto2f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vcmpltps %xmm0, %xmm1, %xmm0
-; ALL-NEXT: vcvtdq2ps %xmm0, %xmm0
-; ALL-NEXT: retq
- %cmpres = fcmp ogt <2 x float> %a, zeroinitializer
- %1 = sitofp <2 x i1> %cmpres to <2 x float>
- ret <2 x float> %1
-}
-
-define <2 x double> @sbto2f64(<2 x double> %a) {
-; NOVL-LABEL: sbto2f64:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vcmpltpd %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; NOVL-NEXT: retq
-;
-; VLDQ-LABEL: sbto2f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; VLDQ-NEXT: vcmpltpd %xmm0, %xmm1, %k0
-; VLDQ-NEXT: vpmovm2d %k0, %xmm0
-; VLDQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: sbto2f64:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vxorpd %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT: vcmpltpd %xmm0, %xmm1, %k1
-; VLNODQ-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
-; VLNODQ-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
-; VLNODQ-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VLNODQ-NEXT: retq
- %cmpres = fcmp ogt <2 x double> %a, zeroinitializer
- %1 = sitofp <2 x i1> %cmpres to <2 x double>
- ret <2 x double> %1
-}
-
-define <16 x float> @ucto16f32(<16 x i8> %a) {
-; ALL-LABEL: ucto16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = uitofp <16 x i8> %a to <16 x float>
- ret <16 x float>%b
-}
-
-define <8 x double> @ucto8f64(<8 x i8> %a) {
-; ALL-LABEL: ucto8f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %b = uitofp <8 x i8> %a to <8 x double>
- ret <8 x double> %b
-}
-
-define <16 x float> @swto16f32(<16 x i16> %a) {
-; ALL-LABEL: swto16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxwd %ymm0, %zmm0
-; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = sitofp <16 x i16> %a to <16 x float>
- ret <16 x float> %b
-}
-
-define <8 x double> @swto8f64(<8 x i16> %a) {
-; ALL-LABEL: swto8f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxwd %xmm0, %ymm0
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %b = sitofp <8 x i16> %a to <8 x double>
- ret <8 x double> %b
-}
-
-define <16 x double> @swto16f64(<16 x i16> %a) {
-; ALL-LABEL: swto16f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovsxwd %ymm0, %zmm1
-; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
-; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
-; ALL-NEXT: retq
- %b = sitofp <16 x i16> %a to <16 x double>
- ret <16 x double> %b
-}
-
-define <16 x double> @ucto16f64(<16 x i8> %a) {
-; ALL-LABEL: ucto16f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
-; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
-; ALL-NEXT: retq
- %b = uitofp <16 x i8> %a to <16 x double>
- ret <16 x double> %b
-}
-
-define <16 x float> @uwto16f32(<16 x i16> %a) {
-; ALL-LABEL: uwto16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = uitofp <16 x i16> %a to <16 x float>
- ret <16 x float> %b
-}
-
-define <8 x double> @uwto8f64(<8 x i16> %a) {
-; ALL-LABEL: uwto8f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %b = uitofp <8 x i16> %a to <8 x double>
- ret <8 x double> %b
-}
-
-define <16 x double> @uwto16f64(<16 x i16> %a) {
-; ALL-LABEL: uwto16f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; ALL-NEXT: vcvtdq2pd %ymm1, %zmm0
-; ALL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; ALL-NEXT: vcvtdq2pd %ymm1, %zmm1
-; ALL-NEXT: retq
- %b = uitofp <16 x i16> %a to <16 x double>
- ret <16 x double> %b
-}
-
-define <16 x float> @sito16f32(<16 x i32> %a) {
-; ALL-LABEL: sito16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = sitofp <16 x i32> %a to <16 x float>
- ret <16 x float> %b
-}
-
-define <16 x double> @sito16f64(<16 x i32> %a) {
-; ALL-LABEL: sito16f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm2
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm1
-; ALL-NEXT: vmovaps %zmm2, %zmm0
-; ALL-NEXT: retq
- %b = sitofp <16 x i32> %a to <16 x double>
- ret <16 x double> %b
-}
-
-define <16 x float> @usto16f32(<16 x i16> %a) {
-; ALL-LABEL: usto16f32:
-; ALL: # %bb.0:
-; ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; ALL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; ALL-NEXT: retq
- %b = uitofp <16 x i16> %a to <16 x float>
- ret <16 x float> %b
-}
-
-define <16 x float> @ubto16f32(<16 x i32> %a) {
-; NODQ-LABEL: ubto16f32:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NODQ-NEXT: vpsrld $31, %zmm0, %zmm0
-; NODQ-NEXT: vcvtdq2ps %zmm0, %zmm0
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: ubto16f32:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; VLDQ-NEXT: vpmovm2d %k0, %zmm0
-; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm0
-; VLDQ-NEXT: vcvtdq2ps %zmm0, %zmm0
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: ubto16f32:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k0
-; DQNOVL-NEXT: vpmovm2d %k0, %zmm0
-; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm0
-; DQNOVL-NEXT: vcvtdq2ps %zmm0, %zmm0
-; DQNOVL-NEXT: retq
- %mask = icmp slt <16 x i32> %a, zeroinitializer
- %1 = uitofp <16 x i1> %mask to <16 x float>
- ret <16 x float> %1
-}
-
-define <16 x double> @ubto16f64(<16 x i32> %a) {
-; NODQ-LABEL: ubto16f64:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NODQ-NEXT: vpcmpgtd %zmm0, %zmm1, %k1
-; NODQ-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; NODQ-NEXT: vpsrld $31, %zmm0, %zmm1
-; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm0
-; NODQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; NODQ-NEXT: vcvtdq2pd %ymm1, %zmm1
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: ubto16f64:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovd2m %zmm0, %k0
-; VLDQ-NEXT: vpmovm2d %k0, %zmm0
-; VLDQ-NEXT: vpsrld $31, %zmm0, %zmm1
-; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm0
-; VLDQ-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; VLDQ-NEXT: vcvtdq2pd %ymm1, %zmm1
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: ubto16f64:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k0
-; DQNOVL-NEXT: vpmovm2d %k0, %zmm0
-; DQNOVL-NEXT: vpsrld $31, %zmm0, %zmm1
-; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm0
-; DQNOVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; DQNOVL-NEXT: vcvtdq2pd %ymm1, %zmm1
-; DQNOVL-NEXT: retq
- %mask = icmp slt <16 x i32> %a, zeroinitializer
- %1 = uitofp <16 x i1> %mask to <16 x double>
- ret <16 x double> %1
-}
-
-define <8 x float> @ubto8f32(<8 x i32> %a) {
-; NOVL-LABEL: ubto8f32:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
-; NOVL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216]
-; NOVL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: ubto8f32:
-; VL: # %bb.0:
-; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT: vpcmpgtd %ymm0, %ymm1, %ymm0
-; VL-NEXT: vpandd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; VL-NEXT: retq
- %mask = icmp slt <8 x i32> %a, zeroinitializer
- %1 = uitofp <8 x i1> %mask to <8 x float>
- ret <8 x float> %1
-}
-
-define <8 x double> @ubto8f64(<8 x i32> %a) {
-; ALL-LABEL: ubto8f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpsrld $31, %ymm0, %ymm0
-; ALL-NEXT: vcvtdq2pd %ymm0, %zmm0
-; ALL-NEXT: retq
- %mask = icmp slt <8 x i32> %a, zeroinitializer
- %1 = uitofp <8 x i1> %mask to <8 x double>
- ret <8 x double> %1
-}
-
-; uitofp of a sign-test mask (icmp slt %a, 0) from <4 x i1> to <4 x float>.
-; The expected code ANDs the compare result with a splat of 1065353216
-; (0x3F800000, the bit pattern of 1.0f) rather than emitting a conversion.
-define <4 x float> @ubto4f32(<4 x i32> %a) {
-; NOVL-LABEL: ubto4f32:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
-; NOVL-NEXT: vpand %xmm1, %xmm0, %xmm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: ubto4f32:
-; VL: # %bb.0:
-; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; VL-NEXT: vpandd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; VL-NEXT: retq
- %mask = icmp slt <4 x i32> %a, zeroinitializer
- %1 = uitofp <4 x i1> %mask to <4 x float>
- ret <4 x float> %1
-}
-
-; uitofp of a sign-test mask (icmp slt %a, 0) from <4 x i1> to <4 x double>.
-; Every configuration is expected to shift the sign bit down to bit 0
-; (vpsrld $31) and convert the resulting 0/1 integers with vcvtdq2pd.
-define <4 x double> @ubto4f64(<4 x i32> %a) {
-; ALL-LABEL: ubto4f64:
-; ALL: # %bb.0:
-; ALL-NEXT: vpsrld $31, %xmm0, %xmm0
-; ALL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; ALL-NEXT: retq
- %mask = icmp slt <4 x i32> %a, zeroinitializer
- %1 = uitofp <4 x i1> %mask to <4 x double>
- ret <4 x double> %1
-}
-
-; uitofp of a non-zero mask (icmp ne %a, 0) from <2 x i1> to <2 x float>.
-; The expected code compares for equality with zero and then applies an
-; and-not against a 4-element splat of 1065353216 (0x3F800000, the bit
-; pattern of 1.0f), so the inverted compare selects 1.0f per lane.
-define <2 x float> @ubto2f32(<2 x i32> %a) {
-; NOVL-LABEL: ubto2f32:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1065353216,1065353216,1065353216,1065353216]
-; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: ubto2f32:
-; VL: # %bb.0:
-; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; VL-NEXT: retq
- %mask = icmp ne <2 x i32> %a, zeroinitializer
- %1 = uitofp <2 x i1> %mask to <2 x float>
- ret <2 x float> %1
-}
-
-; uitofp of a non-zero mask (icmp ne %a, 0) from <2 x i1> to <2 x double>.
-; The expected code compares for equality with zero, applies an and-not
-; against a splat of integer 1 to get 0/1 per lane, and converts that
-; with vcvtdq2pd.
-define <2 x double> @ubto2f64(<2 x i32> %a) {
-; NOVL-LABEL: ubto2f64:
-; NOVL: # %bb.0:
-; NOVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; NOVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
-; NOVL-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; NOVL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; NOVL-NEXT: retq
-;
-; VL-LABEL: ubto2f64:
-; VL: # %bb.0:
-; VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VL-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
-; VL-NEXT: vpandnd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; VL-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VL-NEXT: retq
- %mask = icmp ne <2 x i32> %a, zeroinitializer
- %1 = uitofp <2 x i1> %mask to <2 x double>
- ret <2 x double> %1
-}
-
-; fptoui <2 x double> -> <2 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttpd2udq, vpslld $31 to move bit 0 into the sign bit,
-; a k-mask built with vptestmd or vpmovd2m, then a zero-masked vmovdqa64.
-; The NOVLDQ and DQNOVL expectations operate on zmm registers (hence the
-; "kill" lines) and end with vzeroupper.
-define <2 x i64> @test_2f64toub(<2 x double> %a, <2 x i64> %passthru) {
-; NOVLDQ-LABEL: test_2f64toub:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVLDQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT: vzeroupper
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_2f64toub:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0
-; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_2f64toub:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttpd2udq %xmm0, %xmm0
-; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_2f64toub:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT: vcvttpd2udq %zmm0, %ymm0
-; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %mask = fptoui <2 x double> %a to <2 x i1>
- %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
- ret <2 x i64> %select
-}
-
-; fptoui <4 x double> -> <4 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttpd2dq, vpslld $31 to move bit 0 into the sign bit,
-; a k-mask built with vptestmd or vpmovd2m, then a zero-masked vmovdqa64.
-; The NOVLDQ and DQNOVL expectations build and apply the mask on zmm
-; registers.
-define <4 x i64> @test_4f64toub(<4 x double> %a, <4 x i64> %passthru) {
-; NOVLDQ-LABEL: test_4f64toub:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
-; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_4f64toub:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_4f64toub:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_4f64toub:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT: retq
- %mask = fptoui <4 x double> %a to <4 x i1>
- %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
- ret <4 x i64> %select
-}
-
-; fptoui <8 x double> -> <8 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttpd2dq, vpslld $31 to move bit 0 into the sign bit,
-; a k-mask built with vptestmd or vpmovd2m, then a zero-masked vmovdqa64
-; on zmm registers.
-define <8 x i64> @test_8f64toub(<8 x double> %a, <8 x i64> %passthru) {
-; NOVLDQ-LABEL: test_8f64toub:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_8f64toub:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
-; VLDQ-NEXT: vpmovd2m %ymm0, %k1
-; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_8f64toub:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0
-; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_8f64toub:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: retq
- %mask = fptoui <8 x double> %a to <8 x i1>
- %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
- ret <8 x i64> %select
-}
-
-; fptoui <2 x float> -> <2 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttps2dq, vpslld $31 to move bit 0 into the sign bit,
-; a k-mask built with vptestmd or vpmovd2m, then a zero-masked vmovdqa64.
-; The NOVLDQ and DQNOVL expectations build and apply the mask on zmm
-; registers and end with vzeroupper.
-define <2 x i64> @test_2f32toub(<2 x float> %a, <2 x i64> %passthru) {
-; NOVLDQ-LABEL: test_2f32toub:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT: vzeroupper
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_2f32toub:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_2f32toub:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_2f32toub:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
-; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %mask = fptoui <2 x float> %a to <2 x i1>
- %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
- ret <2 x i64> %select
-}
-
-; fptoui <4 x float> -> <4 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttps2dq, vpslld $31 to move bit 0 into the sign bit,
-; a k-mask built with vptestmd or vpmovd2m, then a zero-masked vmovdqa64.
-; The NOVLDQ and DQNOVL expectations build and apply the mask on zmm
-; registers.
-define <4 x i64> @test_4f32toub(<4 x float> %a, <4 x i64> %passthru) {
-; NOVLDQ-LABEL: test_4f32toub:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_4f32toub:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_4f32toub:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_4f32toub:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
-; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT: retq
- %mask = fptoui <4 x float> %a to <4 x i1>
- %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
- ret <4 x i64> %select
-}
-
-; fptoui <8 x float> -> <8 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttps2dq, vpslld $31 to move bit 0 into the sign bit,
-; a k-mask built with vptestmd or vpmovd2m, then a zero-masked vmovdqa64
-; on zmm registers.
-define <8 x i64> @test_8f32toub(<8 x float> %a, <8 x i64> %passthru) {
-; NOVLDQ-LABEL: test_8f32toub:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
-; NOVLDQ-NEXT: vpslld $31, %ymm0, %ymm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_8f32toub:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
-; VLDQ-NEXT: vpslld $31, %ymm0, %ymm0
-; VLDQ-NEXT: vpmovd2m %ymm0, %k1
-; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_8f32toub:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0
-; VLNODQ-NEXT: vpslld $31, %ymm0, %ymm0
-; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_8f32toub:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0
-; DQNOVL-NEXT: vpslld $31, %ymm0, %ymm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: retq
- %mask = fptoui <8 x float> %a to <8 x i1>
- %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
- ret <8 x i64> %select
-}
-
-; fptoui <16 x float> -> <16 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttps2dq, vpslld $31 to move bit 0 into the sign bit,
-; a k-mask built with vptestmd (NODQ) or vpmovd2m (VLDQ, DQNOVL), then a
-; zero-masked vmovdqa32, all on zmm registers.
-define <16 x i32> @test_16f32toub(<16 x float> %a, <16 x i32> %passthru) {
-; NODQ-LABEL: test_16f32toub:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0
-; NODQ-NEXT: vpslld $31, %zmm0, %zmm0
-; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: test_16f32toub:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0
-; VLDQ-NEXT: vpslld $31, %zmm0, %zmm0
-; VLDQ-NEXT: vpmovd2m %zmm0, %k1
-; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_16f32toub:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0
-; DQNOVL-NEXT: vpslld $31, %zmm0, %zmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: retq
- %mask = fptoui <16 x float> %a to <16 x i1>
- %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
- ret <16 x i32> %select
-}
-
-; fptosi <2 x double> -> <2 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttpd2dq, vpslld $31 to move bit 0 into the sign bit,
-; a k-mask built with vptestmd or vpmovd2m, then a zero-masked vmovdqa64.
-; The NOVLDQ and DQNOVL expectations build and apply the mask on zmm
-; registers and end with vzeroupper.
-define <2 x i64> @test_2f64tosb(<2 x double> %a, <2 x i64> %passthru) {
-; NOVLDQ-LABEL: test_2f64tosb:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0
-; NOVLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT: vzeroupper
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_2f64tosb:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VLDQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_2f64tosb:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttpd2dq %xmm0, %xmm0
-; VLNODQ-NEXT: vpslld $31, %xmm0, %xmm0
-; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_2f64tosb:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; DQNOVL-NEXT: vcvttpd2dq %xmm0, %xmm0
-; DQNOVL-NEXT: vpslld $31, %xmm0, %xmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %mask = fptosi <2 x double> %a to <2 x i1>
- %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
- ret <2 x i64> %select
-}
-
-; fptosi <4 x double> -> <4 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttpd2dq, then a k-mask built straight from the
-; converted value (vptestmd or vpmovd2m, no shift in between), then a
-; zero-masked vmovdqa64. The NOVLDQ and DQNOVL expectations build and
-; apply the mask on zmm registers.
-define <4 x i64> @test_4f64tosb(<4 x double> %a, <4 x i64> %passthru) {
-; NOVLDQ-LABEL: test_4f64tosb:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_4f64tosb:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_4f64tosb:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_4f64tosb:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; DQNOVL-NEXT: vcvttpd2dq %ymm0, %xmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT: retq
- %mask = fptosi <4 x double> %a to <4 x i1>
- %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
- ret <4 x i64> %select
-}
-
-; fptosi <8 x double> -> <8 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttpd2dq, then a k-mask built straight from the
-; converted value (vptestmd or vpmovd2m, no shift in between), then a
-; zero-masked vmovdqa64 on zmm registers.
-define <8 x i64> @test_8f64tosb(<8 x double> %a, <8 x i64> %passthru) {
-; NOVLDQ-LABEL: test_8f64tosb:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_8f64tosb:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VLDQ-NEXT: vpmovd2m %ymm0, %k1
-; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_8f64tosb:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_8f64tosb:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: retq
- %mask = fptosi <8 x double> %a to <8 x i1>
- %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
- ret <8 x i64> %select
-}
-
-; fptosi <2 x float> -> <2 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttps2dq, then a k-mask built straight from the
-; converted value (vptestmd or vpmovd2m, no shift in between), then a
-; zero-masked vmovdqa64. The NOVLDQ and DQNOVL expectations build and
-; apply the mask on zmm registers and end with vzeroupper.
-define <2 x i64> @test_2f32tosb(<2 x float> %a, <2 x i64> %passthru) {
-; NOVLDQ-LABEL: test_2f32tosb:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT: vzeroupper
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_2f32tosb:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_2f32tosb:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_2f32tosb:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %mask = fptosi <2 x float> %a to <2 x i1>
- %select = select <2 x i1> %mask, <2 x i64> %passthru, <2 x i64> zeroinitializer
- ret <2 x i64> %select
-}
-
-; fptosi <4 x float> -> <4 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttps2dq, then a k-mask built straight from the
-; converted value (vptestmd or vpmovd2m, no shift in between), then a
-; zero-masked vmovdqa64. The NOVLDQ and DQNOVL expectations build and
-; apply the mask on zmm registers.
-define <4 x i64> @test_4f32tosb(<4 x float> %a, <4 x i64> %passthru) {
-; NOVLDQ-LABEL: test_4f32tosb:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; NOVLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_4f32tosb:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; VLDQ-NEXT: vpmovd2m %xmm0, %k1
-; VLDQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_4f32tosb:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttps2dq %xmm0, %xmm0
-; VLNODQ-NEXT: vptestmd %xmm0, %xmm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_4f32tosb:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; DQNOVL-NEXT: vcvttps2dq %xmm0, %xmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; DQNOVL-NEXT: retq
- %mask = fptosi <4 x float> %a to <4 x i1>
- %select = select <4 x i1> %mask, <4 x i64> %passthru, <4 x i64> zeroinitializer
- ret <4 x i64> %select
-}
-
-; fptosi <8 x float> -> <8 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttps2dq, then a k-mask built straight from the
-; converted value (vptestmd or vpmovd2m, no shift in between), then a
-; zero-masked vmovdqa64 on zmm registers.
-define <8 x i64> @test_8f32tosb(<8 x float> %a, <8 x i64> %passthru) {
-; NOVLDQ-LABEL: test_8f32tosb:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
-; NOVLDQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NOVLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_8f32tosb:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2dq %ymm0, %ymm0
-; VLDQ-NEXT: vpmovd2m %ymm0, %k1
-; VLDQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_8f32tosb:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vcvttps2dq %ymm0, %ymm0
-; VLNODQ-NEXT: vptestmd %ymm0, %ymm0, %k1
-; VLNODQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_8f32tosb:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvttps2dq %ymm0, %ymm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: retq
- %mask = fptosi <8 x float> %a to <8 x i1>
- %select = select <8 x i1> %mask, <8 x i64> %passthru, <8 x i64> zeroinitializer
- ret <8 x i64> %select
-}
-
-; fptosi <16 x float> -> <16 x i1>, with the result used as the condition of
-; a select between %passthru and zero.
-; Expected shape: vcvttps2dq, then a k-mask built straight from the
-; converted value (vptestmd for NODQ, vpmovd2m for VLDQ and DQNOVL), then
-; a zero-masked vmovdqa32, all on zmm registers.
-define <16 x i32> @test_16f32tosb(<16 x float> %a, <16 x i32> %passthru) {
-; NODQ-LABEL: test_16f32tosb:
-; NODQ: # %bb.0:
-; NODQ-NEXT: vcvttps2dq %zmm0, %zmm0
-; NODQ-NEXT: vptestmd %zmm0, %zmm0, %k1
-; NODQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; NODQ-NEXT: retq
-;
-; VLDQ-LABEL: test_16f32tosb:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vcvttps2dq %zmm0, %zmm0
-; VLDQ-NEXT: vpmovd2m %zmm0, %k1
-; VLDQ-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_16f32tosb:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: vcvttps2dq %zmm0, %zmm0
-; DQNOVL-NEXT: vpmovd2m %zmm0, %k1
-; DQNOVL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: retq
- %mask = fptosi <16 x float> %a to <16 x i1>
- %select = select <16 x i1> %mask, <16 x i32> %passthru, <16 x i32> zeroinitializer
- ret <16 x i32> %select
-}
-
-; Masked sitofp of a loaded <2 x i32>: lanes where %c is negative take the
-; converted value, all other lanes are zero.
-; The VLDQ and VLNODQ expectations fold both the load and the zero-mask
-; into vcvtdq2pd; the NOVLDQ and DQNOVL expectations convert first and
-; apply the mask with a separate zmm vmovapd, ending with vzeroupper.
-define <2 x double> @test_sito2f64_mask_load(<2 x i32> *%a, <2 x i64> %c) {
-; NOVLDQ-LABEL: test_sito2f64_mask_load:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vcvtdq2pd (%rdi), %xmm0
-; NOVLDQ-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT: vzeroupper
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_sito2f64_mask_load:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovq2m %xmm0, %k1
-; VLDQ-NEXT: vcvtdq2pd (%rdi), %xmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_sito2f64_mask_load:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
-; VLNODQ-NEXT: vcvtdq2pd (%rdi), %xmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_sito2f64_mask_load:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT: vpmovq2m %zmm0, %k1
-; DQNOVL-NEXT: vcvtdq2pd (%rdi), %xmm0
-; DQNOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %mask = icmp slt <2 x i64> %c, zeroinitializer
- %ld = load <2 x i32>, <2 x i32> *%a
- %cvt = sitofp <2 x i32> %ld to <2 x double>
- %sel = select <2 x i1> %mask, <2 x double> %cvt, <2 x double> zeroinitializer
- ret <2 x double> %sel
-}
-
-; Masked uitofp of a loaded <2 x i32>: lanes where %c is negative take the
-; converted value, all other lanes are zero.
-; The VLDQ and VLNODQ expectations fold both the load and the zero-mask
-; into vcvtudq2pd; the NOVLDQ and DQNOVL expectations load with vmovsd,
-; convert with a zmm-destination vcvtudq2pd and apply the mask with a
-; separate vmovapd, ending with vzeroupper.
-define <2 x double> @test_uito2f64_mask_load(<2 x i32> *%a, <2 x i64> %c) {
-; NOVLDQ-LABEL: test_uito2f64_mask_load:
-; NOVLDQ: # %bb.0:
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; NOVLDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; NOVLDQ-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
-; NOVLDQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; NOVLDQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; NOVLDQ-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
-; NOVLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; NOVLDQ-NEXT: vzeroupper
-; NOVLDQ-NEXT: retq
-;
-; VLDQ-LABEL: test_uito2f64_mask_load:
-; VLDQ: # %bb.0:
-; VLDQ-NEXT: vpmovq2m %xmm0, %k1
-; VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0 {%k1} {z}
-; VLDQ-NEXT: retq
-;
-; VLNODQ-LABEL: test_uito2f64_mask_load:
-; VLNODQ: # %bb.0:
-; VLNODQ-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VLNODQ-NEXT: vpcmpgtq %xmm0, %xmm1, %k1
-; VLNODQ-NEXT: vcvtudq2pd (%rdi), %xmm0 {%k1} {z}
-; VLNODQ-NEXT: retq
-;
-; DQNOVL-LABEL: test_uito2f64_mask_load:
-; DQNOVL: # %bb.0:
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; DQNOVL-NEXT: vpmovq2m %zmm0, %k1
-; DQNOVL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; DQNOVL-NEXT: vcvtudq2pd %ymm0, %zmm0
-; DQNOVL-NEXT: vmovapd %zmm0, %zmm0 {%k1} {z}
-; DQNOVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; DQNOVL-NEXT: vzeroupper
-; DQNOVL-NEXT: retq
- %mask = icmp slt <2 x i64> %c, zeroinitializer
- %ld = load <2 x i32>, <2 x i32> *%a
- %cvt = uitofp <2 x i32> %ld to <2 x double>
- %sel = select <2 x i1> %mask, <2 x double> %cvt, <2 x double> zeroinitializer
- ret <2 x double> %sel
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=KNL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-apple-darwin -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,SKX
-
- attributes #0 = { nounwind }
-
-; Truncates <16 x i32> to <16 x i8> and returns the result in a register.
-; Both run configurations expect a single vpmovdb from zmm to xmm followed
-; by vzeroupper.
-define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 {
-; ALL-LABEL: trunc_16x32_to_16x8:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovdb %zmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <16 x i32> %i to <16 x i8>
- ret <16 x i8> %x
-}
-
-; Truncates <8 x i64> to <8 x i16> and returns the result in a register.
-; Both run configurations expect a single vpmovqw from zmm to xmm followed
-; by vzeroupper.
-define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 {
-; ALL-LABEL: trunc_8x64_to_8x16:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovqw %zmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <8 x i64> %i to <8 x i16>
- ret <8 x i16> %x
-}
-
-; Truncates <16 x i32> to <16 x i16> and returns the result in a register.
-; Both run configurations expect a single vpmovdw from zmm to ymm; no
-; vzeroupper is expected because the result is returned in ymm0.
-define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) #0 {
-; ALL-LABEL: trunc_v16i32_to_v16i16:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
-; ALL-NEXT: retq
- %1 = trunc <16 x i32> %x to <16 x i16>
- ret <16 x i16> %1
-}
-
-; Truncates <8 x i64> to <8 x i8> and returns the result in a register.
-; Both run configurations expect a single vpmovqb from zmm to xmm followed
-; by vzeroupper.
-define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 {
-; ALL-LABEL: trunc_qb_512:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovqb %zmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <8 x i64> %i to <8 x i8>
- ret <8 x i8> %x
-}
-
-; Truncates <8 x i64> to <8 x i8> and stores the result through %res.
-; Both run configurations expect the truncate and the store to be a single
-; vpmovqb with a memory destination, followed by vzeroupper.
-define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 {
-; ALL-LABEL: trunc_qb_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovqb %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <8 x i64> %i to <8 x i8>
- store <8 x i8> %x, <8 x i8>* %res
- ret void
-}
-
-; Truncates <4 x i64> to <4 x i8> and returns the result in a register.
-; The KNL expectation treats the ymm input as zmm (kill line) and uses the
-; 512-bit vpmovqb; the SKX expectation uses the 256-bit form directly.
-define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
-; KNL-LABEL: trunc_qb_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqb %zmm0, %xmm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qb_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqb %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <4 x i64> %i to <4 x i8>
- ret <4 x i8> %x
-}
-
-; Truncates <4 x i64> to <4 x i8> and stores the result through %res.
-; The KNL expectation truncates in a register with the 512-bit vpmovqb and
-; stores the low 32 bits with vmovd; the SKX expectation is a single
-; 256-bit vpmovqb with a memory destination.
-define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
-; KNL-LABEL: trunc_qb_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqb %zmm0, %xmm0
-; KNL-NEXT: vmovd %xmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qb_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqb %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <4 x i64> %i to <4 x i8>
- store <4 x i8> %x, <4 x i8>* %res
- ret void
-}
-
-; Truncates <2 x i64> to <2 x i8> and returns the result in a register.
-; Both run configurations expect a vpshufb that moves source bytes 0 and 8
-; into the two low byte positions; the remaining bytes are undefined.
-define <2 x i8> @trunc_qb_128(<2 x i64> %i) #0 {
-; ALL-LABEL: trunc_qb_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
- %x = trunc <2 x i64> %i to <2 x i8>
- ret <2 x i8> %x
-}
-
-; Truncates <2 x i64> to <2 x i8> and stores the result through %res.
-; The KNL expectation shuffles bytes 0 and 8 together with vpshufb and
-; stores the low 16 bits with vpextrw; the SKX expectation is a single
-; 128-bit vpmovqb with a memory destination.
-define void @trunc_qb_128_mem(<2 x i64> %i, <2 x i8>* %res) #0 {
-; KNL-LABEL: trunc_qb_128_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; KNL-NEXT: vpextrw $0, %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qb_128_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqb %xmm0, (%rdi)
-; SKX-NEXT: retq
- %x = trunc <2 x i64> %i to <2 x i8>
- store <2 x i8> %x, <2 x i8>* %res
- ret void
-}
-
-; Truncates <8 x i64> to <8 x i16> and returns the result in a register.
-; Both run configurations expect a single vpmovqw from zmm to xmm followed
-; by vzeroupper.
-define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 {
-; ALL-LABEL: trunc_qw_512:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovqw %zmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <8 x i64> %i to <8 x i16>
- ret <8 x i16> %x
-}
-
-; Truncates <8 x i64> to <8 x i16> and stores the result through %res.
-; Both run configurations expect the truncate and the store to be a single
-; vpmovqw with a memory destination, followed by vzeroupper.
-define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 {
-; ALL-LABEL: trunc_qw_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovqw %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <8 x i64> %i to <8 x i16>
- store <8 x i16> %x, <8 x i16>* %res
- ret void
-}
-
-; Truncates <4 x i64> to <4 x i16> and returns the result in a register.
-; The KNL expectation treats the ymm input as zmm (kill line) and uses the
-; 512-bit vpmovqw; the SKX expectation uses the 256-bit form directly.
-define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
-; KNL-LABEL: trunc_qw_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqw %zmm0, %xmm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qw_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqw %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <4 x i64> %i to <4 x i16>
- ret <4 x i16> %x
-}
-
-; Truncates <4 x i64> to <4 x i16> and stores the result through %res.
-; The KNL expectation truncates in a register with the 512-bit vpmovqw and
-; stores the low 64 bits with vmovq; the SKX expectation is a single
-; 256-bit vpmovqw with a memory destination.
-define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
-; KNL-LABEL: trunc_qw_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqw %zmm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qw_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqw %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <4 x i64> %i to <4 x i16>
- store <4 x i16> %x, <4 x i16>* %res
- ret void
-}
-
-; Truncates <2 x i64> to <2 x i16> and returns the result in a register.
-; The KNL expectation uses a vpshufd + vpshuflw pair; the SKX expectation
-; (which enables fast-variable-shuffle) uses a single vpshufb.
-define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
-; KNL-LABEL: trunc_qw_128:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qw_128:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; SKX-NEXT: retq
- %x = trunc <2 x i64> %i to <2 x i16>
- ret <2 x i16> %x
-}
-
-; Truncates <2 x i64> to <2 x i16> and stores the result through %res.
-; The KNL expectation uses a vpshufd + vpshuflw pair and stores the low
-; 32 bits with vmovd; the SKX expectation is a single 128-bit vpmovqw with
-; a memory destination.
-define void @trunc_qw_128_mem(<2 x i64> %i, <2 x i16>* %res) #0 {
-; KNL-LABEL: trunc_qw_128_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; KNL-NEXT: vmovd %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qw_128_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqw %xmm0, (%rdi)
-; SKX-NEXT: retq
- %x = trunc <2 x i64> %i to <2 x i16>
- store <2 x i16> %x, <2 x i16>* %res
- ret void
-}
-
-; Truncates <8 x i64> to <8 x i32> and returns the result in a register.
-; Both run configurations expect a single vpmovqd from zmm to ymm; no
-; vzeroupper is expected because the result is returned in ymm0.
-define <8 x i32> @trunc_qd_512(<8 x i64> %i) #0 {
-; ALL-LABEL: trunc_qd_512:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovqd %zmm0, %ymm0
-; ALL-NEXT: retq
- %x = trunc <8 x i64> %i to <8 x i32>
- ret <8 x i32> %x
-}
-
-; Truncates <8 x i64> to <8 x i32> and stores the result through %res.
-; Both run configurations expect the truncate and the store to be a single
-; vpmovqd with a memory destination, followed by vzeroupper.
-define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 {
-; ALL-LABEL: trunc_qd_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovqd %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <8 x i64> %i to <8 x i32>
- store <8 x i32> %x, <8 x i32>* %res
- ret void
-}
-
-; Truncates <4 x i64> to <4 x i32> and returns the result in a register.
-; The KNL expectation treats the ymm input as zmm, uses the 512-bit
-; vpmovqd and returns the low xmm part of the ymm result (kill lines); the
-; SKX expectation uses the 256-bit vpmovqd directly.
-define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
-; KNL-LABEL: trunc_qd_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qd_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqd %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <4 x i64> %i to <4 x i32>
- ret <4 x i32> %x
-}
-
-; Truncates <4 x i64> to <4 x i32> and stores the result through %res.
-; The KNL expectation truncates in a register with the 512-bit vpmovqd and
-; stores the low 128 bits with vmovdqa; the SKX expectation is a single
-; 256-bit vpmovqd with a memory destination.
-define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 {
-; KNL-LABEL: trunc_qd_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovqd %zmm0, %ymm0
-; KNL-NEXT: vmovdqa %xmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qd_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqd %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <4 x i64> %i to <4 x i32>
- store <4 x i32> %x, <4 x i32>* %res
- ret void
-}
-
-; Truncates <2 x i64> to <2 x i32> and returns the result in a register.
-; Both run configurations expect a vpermilps that moves 32-bit elements 0
-; and 2 (the low halves of the two i64 lanes) into the two low positions.
-define <2 x i32> @trunc_qd_128(<2 x i64> %i) #0 {
-; ALL-LABEL: trunc_qd_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; ALL-NEXT: retq
- %x = trunc <2 x i64> %i to <2 x i32>
- ret <2 x i32> %x
-}
-
-; Truncates <2 x i64> to <2 x i32> and stores the result through %res.
-; The KNL expectation shuffles with vpermilps and stores the low 64 bits
-; with vmovlps; the SKX expectation is a single 128-bit vpmovqd with a
-; memory destination.
-define void @trunc_qd_128_mem(<2 x i64> %i, <2 x i32>* %res) #0 {
-; KNL-LABEL: trunc_qd_128_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; KNL-NEXT: vmovlps %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_qd_128_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovqd %xmm0, (%rdi)
-; SKX-NEXT: retq
- %x = trunc <2 x i64> %i to <2 x i32>
- store <2 x i32> %x, <2 x i32>* %res
- ret void
-}
-
-; Truncates <16 x i32> to <16 x i8> and returns the result in a register.
-; Both run configurations expect a single vpmovdb from zmm to xmm followed
-; by vzeroupper.
-define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 {
-; ALL-LABEL: trunc_db_512:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovdb %zmm0, %xmm0
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <16 x i32> %i to <16 x i8>
- ret <16 x i8> %x
-}
-
-; Truncates <16 x i32> to <16 x i8> and stores the result through %res.
-; Both run configurations expect the truncate and the store to be a single
-; vpmovdb with a memory destination, followed by vzeroupper.
-define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 {
-; ALL-LABEL: trunc_db_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovdb %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <16 x i32> %i to <16 x i8>
- store <16 x i8> %x, <16 x i8>* %res
- ret void
-}
-
-; Truncates <8 x i32> to <8 x i8> and returns the result in a register.
-; The KNL expectation treats the ymm input as zmm (kill line) and uses the
-; 512-bit vpmovdb; the SKX expectation uses the 256-bit form directly.
-define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
-; KNL-LABEL: trunc_db_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_db_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovdb %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <8 x i32> %i to <8 x i8>
- ret <8 x i8> %x
-}
-
-; Truncates <8 x i32> to <8 x i8> and stores the result through %res.
-; The KNL expectation truncates in a register with the 512-bit vpmovdb and
-; stores the low 64 bits with vmovq; the SKX expectation is a single
-; 256-bit vpmovdb with a memory destination.
-define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
-; KNL-LABEL: trunc_db_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_db_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovdb %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <8 x i32> %i to <8 x i8>
- store <8 x i8> %x, <8 x i8>* %res
- ret void
-}
-
-; Truncates <4 x i32> to <4 x i8> and returns the result in a register.
-; Both run configurations expect a vpshufb that moves source bytes 0, 4, 8
-; and 12 into the four low byte positions; the remaining bytes are
-; undefined.
-define <4 x i8> @trunc_db_128(<4 x i32> %i) #0 {
-; ALL-LABEL: trunc_db_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
- %x = trunc <4 x i32> %i to <4 x i8>
- ret <4 x i8> %x
-}
-
-define void @trunc_db_128_mem(<4 x i32> %i, <4 x i8>* %res) #0 {
-; KNL-LABEL: trunc_db_128_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; KNL-NEXT: vmovd %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_db_128_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovdb %xmm0, (%rdi)
-; SKX-NEXT: retq
- %x = trunc <4 x i32> %i to <4 x i8>
- store <4 x i8> %x, <4 x i8>* %res
- ret void
-}
-
-define <16 x i16> @trunc_dw_512(<16 x i32> %i) #0 {
-; ALL-LABEL: trunc_dw_512:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovdw %zmm0, %ymm0
-; ALL-NEXT: retq
- %x = trunc <16 x i32> %i to <16 x i16>
- ret <16 x i16> %x
-}
-
-define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 {
-; ALL-LABEL: trunc_dw_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovdw %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x = trunc <16 x i32> %i to <16 x i16>
- store <16 x i16> %x, <16 x i16>* %res
- ret void
-}
-
-define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
-; KNL-LABEL: trunc_dw_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: ## kill: def $xmm0 killed $xmm0 killed $ymm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_dw_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovdw %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <8 x i32> %i to <8 x i16>
- ret <8 x i16> %x
-}
-
-define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 {
-; KNL-LABEL: trunc_dw_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0
-; KNL-NEXT: vpmovdw %zmm0, %ymm0
-; KNL-NEXT: vmovdqa %xmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_dw_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovdw %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <8 x i32> %i to <8 x i16>
- store <8 x i16> %x, <8 x i16>* %res
- ret void
-}
-
-define void @trunc_dw_128_mem(<4 x i32> %i, <4 x i16>* %res) #0 {
-; KNL-LABEL: trunc_dw_128_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; KNL-NEXT: vmovq %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_dw_128_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovdw %xmm0, (%rdi)
-; SKX-NEXT: retq
- %x = trunc <4 x i32> %i to <4 x i16>
- store <4 x i16> %x, <4 x i16>* %res
- ret void
-}
-
-define <32 x i8> @trunc_wb_512(<32 x i16> %i) #0 {
-; KNL-LABEL: trunc_wb_512:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; KNL-NEXT: vpmovdb %zmm1, %xmm1
-; KNL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_wb_512:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovwb %zmm0, %ymm0
-; SKX-NEXT: retq
- %x = trunc <32 x i16> %i to <32 x i8>
- ret <32 x i8> %x
-}
-
-define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
-; KNL-LABEL: trunc_wb_512_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; KNL-NEXT: vpmovdb %zmm1, 16(%rdi)
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_wb_512_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovwb %zmm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <32 x i16> %i to <32 x i8>
- store <32 x i8> %x, <32 x i8>* %res
- ret void
-}
-
-define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 {
-; KNL-LABEL: trunc_wb_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_wb_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovwb %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <16 x i16> %i to <16 x i8>
- ret <16 x i8> %x
-}
-
-define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 {
-; KNL-LABEL: trunc_wb_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_wb_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovwb %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x = trunc <16 x i16> %i to <16 x i8>
- store <16 x i8> %x, <16 x i8>* %res
- ret void
-}
-
-define <8 x i8> @trunc_wb_128(<8 x i16> %i) #0 {
-; ALL-LABEL: trunc_wb_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; ALL-NEXT: retq
- %x = trunc <8 x i16> %i to <8 x i8>
- ret <8 x i8> %x
-}
-
-define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 {
-; KNL-LABEL: trunc_wb_128_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; KNL-NEXT: vmovq %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: trunc_wb_128_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovwb %xmm0, (%rdi)
-; SKX-NEXT: retq
- %x = trunc <8 x i16> %i to <8 x i8>
- store <8 x i8> %x, <8 x i8>* %res
- ret void
-}
-
-
-define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: usat_trunc_wb_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: usat_trunc_wb_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovuswb %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x6 = trunc <16 x i16> %x5 to <16 x i8>
- store <16 x i8> %x6, <16 x i8>* %res, align 1
- ret void
-}
-
-define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
-; KNL-LABEL: usat_trunc_wb_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: usat_trunc_wb_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovuswb %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x6 = trunc <16 x i16> %x5 to <16 x i8>
- ret <16 x i8> %x6
-}
-
-define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
-; KNL-LABEL: usat_trunc_wb_128_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: usat_trunc_wb_128_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmovuswb %xmm0, (%rdi)
-; SKX-NEXT: retq
- %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x6 = trunc <8 x i16> %x5 to <8 x i8>
- store <8 x i8> %x6, <8 x i8>* %res, align 1
- ret void
-}
-
-define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
-; ALL-LABEL: usat_trunc_db_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovusdb %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x6 = trunc <16 x i32> %x5 to <16 x i8>
- store <16 x i8> %x6, <16 x i8>* %res, align 1
- ret void
-}
-
-define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
-; ALL-LABEL: usat_trunc_qb_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovusqb %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %x6 = trunc <8 x i64> %x5 to <8 x i8>
- store <8 x i8> %x6, <8 x i8>* %res, align 1
- ret void
-}
-
-define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
-; ALL-LABEL: usat_trunc_qd_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovusqd %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %x6 = trunc <8 x i64> %x5 to <8 x i32>
- store <8 x i32> %x6, <8 x i32>* %res, align 1
- ret void
-}
-
-define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
-; ALL-LABEL: usat_trunc_qw_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovusqw %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %x6 = trunc <8 x i64> %x5 to <8 x i16>
- store <8 x i16> %x6, <8 x i16>* %res, align 1
- ret void
-}
-
-define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
-; ALL-LABEL: usat_trunc_db_1024:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovusdb %zmm0, %xmm0
-; ALL-NEXT: vpmovusdb %zmm1, %xmm1
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: retq
- %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x6 = trunc <32 x i32> %x5 to <32 x i8>
- ret <32 x i8> %x6
-}
-
-define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
-; ALL-LABEL: usat_trunc_db_1024_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovusdb %zmm0, %xmm0
-; ALL-NEXT: vpmovusdb %zmm1, %xmm1
-; ALL-NEXT: vmovdqu %xmm1, 16(%rdi)
-; ALL-NEXT: vmovdqu %xmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x6 = trunc <32 x i32> %x5 to <32 x i8>
- store <32 x i8>%x6, <32 x i8>* %p, align 1
- ret void
-}
-
-define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) {
-; ALL-LABEL: usat_trunc_dw_512:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovusdw %zmm0, %ymm0
-; ALL-NEXT: retq
- %x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %x6 = trunc <16 x i32> %x5 to <16 x i16>
- ret <16 x i16> %x6
-}
-
-define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
-; ALL-LABEL: usat_trunc_wb_128:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; ALL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; ALL-NEXT: retq
- %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x6 = trunc <8 x i16> %x5 to <8 x i8>
- ret <8 x i8>%x6
-}
-
-define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
-; ALL-LABEL: usat_trunc_qw_1024:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpmovusqw %zmm0, %xmm0
-; ALL-NEXT: vpmovusqw %zmm1, %xmm1
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: retq
- %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %x6 = trunc <16 x i64> %x5 to <16 x i16>
- ret <16 x i16> %x6
-}
-
-define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) {
-; KNL-LABEL: usat_trunc_db_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; KNL-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: usat_trunc_db_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; SKX-NEXT: vpmovdb %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %tmp3 = trunc <8 x i32> %tmp2 to <8 x i8>
- %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %tmp4
-}
-
-
-
-; Tests for the following unsigned saturation pattern:
-
-; %a = icmp sgt %x, C1
-; %b = select %a, %x, C2
-; %c = icmp slt %b, C2
-; %d = select %c, %b, C2
-; %res = trunc %d
-
-
-define void @smax_usat_trunc_wb_256_mem1(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: smax_usat_trunc_wb_256_mem1:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: smax_usat_trunc_wb_256_mem1:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpmovuswb %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x6 = trunc <16 x i16> %x5 to <16 x i8>
- store <16 x i8> %x6, <16 x i8>* %res, align 1
- ret void
-}
-
-; Test for smax(smin(x, C2), C1).
-define void @smax_usat_trunc_wb_256_mem2(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: smax_usat_trunc_wb_256_mem2:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: smax_usat_trunc_wb_256_mem2:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpmovuswb %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x1 = icmp slt <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x3 = icmp sgt <16 x i16> %x2, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x6 = trunc <16 x i16> %x5 to <16 x i8>
- store <16 x i8> %x6, <16 x i8>* %res, align 1
- ret void
-}
-
-define <16 x i8> @smax_usat_trunc_wb_256(<16 x i16> %i) {
-; KNL-LABEL: smax_usat_trunc_wb_256:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, %xmm0
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: smax_usat_trunc_wb_256:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpmovuswb %ymm0, %xmm0
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x3 = icmp slt <16 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x6 = trunc <16 x i16> %x5 to <16 x i8>
- ret <16 x i8> %x6
- }
-
-define void @smax_usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
-; KNL-LABEL: smax_usat_trunc_wb_128_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; KNL-NEXT: vpminsw {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; KNL-NEXT: vmovq %xmm0, (%rdi)
-; KNL-NEXT: retq
-;
-; SKX-LABEL: smax_usat_trunc_wb_128_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; SKX-NEXT: vpmovuswb %xmm0, (%rdi)
-; SKX-NEXT: retq
- %x1 = icmp sgt <8 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x2 = select <8 x i1> %x1, <8 x i16> %i, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x3 = icmp slt <8 x i16> %x2, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x5 = select <8 x i1> %x3, <8 x i16> %x2, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %x6 = trunc <8 x i16> %x5 to <8 x i8>
- store <8 x i8> %x6, <8 x i8>* %res, align 1
- ret void
-}
-
-define void @smax_usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
-; ALL-LABEL: smax_usat_trunc_db_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpmovusdb %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %x3 = icmp slt <16 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x6 = trunc <16 x i32> %x5 to <16 x i8>
- store <16 x i8> %x6, <16 x i8>* %res, align 1
- ret void
-}
-
-define void @smax_usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
-; ALL-LABEL: smax_usat_trunc_qb_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpmovusqb %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
- %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
- %x3 = icmp slt <8 x i64> %x2, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %x6 = trunc <8 x i64> %x5 to <8 x i8>
- store <8 x i8> %x6, <8 x i8>* %res, align 1
- ret void
-}
-
-define void @smax_usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
-; ALL-LABEL: smax_usat_trunc_qd_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpmovusqd %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
- %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
- %x3 = icmp slt <8 x i64> %x2, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %x6 = trunc <8 x i64> %x5 to <8 x i32>
- store <8 x i32> %x6, <8 x i32>* %res, align 1
- ret void
-}
-
-define void @smax_usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
-; ALL-LABEL: smax_usat_trunc_qw_512_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpmovusqw %zmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x1 = icmp sgt <8 x i64> %i, <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
- %x2 = select <8 x i1> %x1, <8 x i64> %i, <8 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>
- %x3 = icmp slt <8 x i64> %x2, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %x5 = select <8 x i1> %x3, <8 x i64> %x2, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %x6 = trunc <8 x i64> %x5 to <8 x i16>
- store <8 x i16> %x6, <8 x i16>* %res, align 1
- ret void
-}
-
-define <32 x i8> @smax_usat_trunc_db_1024(<32 x i32> %i) {
-; ALL-LABEL: smax_usat_trunc_db_1024:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1
-; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
-; ALL-NEXT: vpmovusdb %zmm0, %xmm0
-; ALL-NEXT: vpmovusdb %zmm1, %xmm1
-; ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; ALL-NEXT: retq
- %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x6 = trunc <32 x i32> %x5 to <32 x i8>
- ret <32 x i8> %x6
-}
-
-define void @smax_usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
-; ALL-LABEL: smax_usat_trunc_db_1024_mem:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; ALL-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1
-; ALL-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
-; ALL-NEXT: vpmovusdb %zmm0, %xmm0
-; ALL-NEXT: vpmovusdb %zmm1, %xmm1
-; ALL-NEXT: vmovdqu %xmm1, 16(%rdi)
-; ALL-NEXT: vmovdqu %xmm0, (%rdi)
-; ALL-NEXT: vzeroupper
-; ALL-NEXT: retq
- %x1 = icmp sgt <32 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %x2 = select <32 x i1> %x1, <32 x i32> %i, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %x3 = icmp slt <32 x i32> %x2, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x5 = select <32 x i1> %x3, <32 x i32> %x2, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %x6 = trunc <32 x i32> %x5 to <32 x i8>
- store <32 x i8>%x6, <32 x i8>* %p, align 1
- ret void
-}
-
-define <16 x i16> @smax_usat_trunc_dw_512(<16 x i32> %i) {
-; ALL-LABEL: smax_usat_trunc_dw_512:
-; ALL: ## %bb.0:
-; ALL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; ALL-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; ALL-NEXT: vpmovusdw %zmm0, %ymm0
-; ALL-NEXT: retq
- %x1 = icmp sgt <16 x i32> %i, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %x2 = select <16 x i1> %x1, <16 x i32> %i, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
- %x3 = icmp slt <16 x i32> %x2, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %x5 = select <16 x i1> %x3, <16 x i32> %x2, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %x6 = trunc <16 x i32> %x5 to <16 x i16>
- ret <16 x i16> %x6
-}
-
-define void @negative_test1_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; KNL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; KNL-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: negative_test1_smax_usat_trunc_wb_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; SKX-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; SKX-NEXT: vpmovwb %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x1 = icmp sgt <16 x i16> %i, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
- %x3 = icmp slt <16 x i16> %x2, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
- %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
- %x6 = trunc <16 x i16> %x5 to <16 x i8>
- store <16 x i8> %x6, <16 x i8>* %res, align 1
- ret void
-}
-
-define void @negative_test2_smax_usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
-; KNL: ## %bb.0:
-; KNL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; KNL-NEXT: vpmovdb %zmm0, (%rdi)
-; KNL-NEXT: vzeroupper
-; KNL-NEXT: retq
-;
-; SKX-LABEL: negative_test2_smax_usat_trunc_wb_256_mem:
-; SKX: ## %bb.0:
-; SKX-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; SKX-NEXT: vpmovwb %ymm0, (%rdi)
-; SKX-NEXT: vzeroupper
-; SKX-NEXT: retq
- %x1 = icmp sgt <16 x i16> %i, <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10>
- %x2 = select <16 x i1> %x1, <16 x i16> %i, <16 x i16> <i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10, i16 -10>
- %x3 = icmp slt <16 x i16> %x2, <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5>
- %x5 = select <16 x i1> %x3, <16 x i16> %x2, <16 x i16> <i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5, i16 -5>
- %x6 = trunc <16 x i16> %x5 to <16 x i8>
- store <16 x i8> %x6, <16 x i8>* %res, align 1
- ret void
-}
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-NOSSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-SSE --check-prefix=CHECK-SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-AVX --check-prefix=CHECK-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-ALL --check-prefix=CHECK-WIDE-AVX --check-prefix=CHECK-WIDE-AVX2
declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: test1:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
ret <8 x i16> %r
; CHECK-AVX: # %bb.0:
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: test2:
-; CHECK-WIDE-AVX: # %bb.0:
-; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-WIDE-AVX-NEXT: retq
%r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
ret <4 x i32> %r
}
; CHECK-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: or_bswap:
-; CHECK-WIDE-AVX: # %bb.0:
-; CHECK-WIDE-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
-; CHECK-WIDE-AVX-NEXT: retq
%xt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %x)
%yt = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %y)
%r = or <4 x i32> %xt, %yt
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: test3:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
ret <2 x i64> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: test4:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
ret <16 x i16> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: test5:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
ret <8 x i32> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: test6:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
ret <4 x i64> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: test7:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v)
ret <4 x i16> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v8i16:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,256,65535,512,65023,1024,64511,1536]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6>)
ret <8 x i16> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v4i32:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [0,4294967295,33554432,4261412863]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> <i32 0, i32 -1, i32 2, i32 -3>)
ret <4 x i32> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v2i64:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [18374686479671623680,18446744073709551615]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> <i64 255, i64 -1>)
ret <2 x i64> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v16i16:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,256,65535,512,65023,1024,64511,1536,63999,2048,63487,2560,62975,3072,62463,3584]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> <i16 0, i16 1, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14>)
ret <16 x i16> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v8i32:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [0,16777216,4294967295,33554432,4261412863,67108864,4227858431,100663296]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> <i32 0, i32 1, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6>)
ret <8 x i32> %r
; CHECK-AVX: # %bb.0: # %entry
; CHECK-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
; CHECK-AVX-NEXT: retq
-;
-; CHECK-WIDE-AVX-LABEL: fold_v4i64:
-; CHECK-WIDE-AVX: # %bb.0: # %entry
-; CHECK-WIDE-AVX-NEXT: vmovaps {{.*#+}} ymm0 = [18374686479671623680,18446744073709551615,18446462598732840960,72056494526300160]
-; CHECK-WIDE-AVX-NEXT: retq
entry:
%r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> <i64 255, i64 -1, i64 65535, i64 16776960>)
ret <4 x i64> %r
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core2 -mattr=+sse2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core2 -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
; FIXME: Ideally we should be able to fold the entire body of @test1 into a
; single paddd instruction. At the moment we produce the sequence
; CHECK: # %bb.0:
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test1:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <2 x i32>
%add = add <2 x i32> %1, <i32 3, i32 5>
%2 = bitcast <2 x i32> %add to double
; CHECK: # %bb.0:
; CHECK-NEXT: paddd %xmm1, %xmm0
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test2:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: paddd %xmm1, %xmm0
-; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <2 x i32>
%2 = bitcast double %B to <2 x i32>
%add = add <2 x i32> %1, %2
; CHECK-NEXT: addps {{.*}}(%rip), %xmm0
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test3:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: movq %rdi, %xmm0
-; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT: movq %xmm0, %rax
-; CHECK-WIDE-NEXT: retq
%1 = bitcast i64 %A to <2 x float>
%add = fadd <2 x float> %1, <float 3.0, float 5.0>
%2 = bitcast <2 x float> %add to i64
; CHECK-NEXT: paddd {{.*}}(%rip), %xmm0
; CHECK-NEXT: movq %xmm0, %rax
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test4:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: movq %rdi, %xmm0
-; CHECK-WIDE-NEXT: paddd {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT: movq %xmm0, %rax
-; CHECK-WIDE-NEXT: retq
%1 = bitcast i64 %A to <2 x i32>
%add = add <2 x i32> %1, <i32 3, i32 5>
%2 = bitcast <2 x i32> %add to i64
; CHECK: # %bb.0:
; CHECK-NEXT: addps {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test5:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: addps {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <2 x float>
%add = fadd <2 x float> %1, <float 3.0, float 5.0>
%2 = bitcast <2 x float> %add to double
; CHECK: # %bb.0:
; CHECK-NEXT: paddw {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test6:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: paddw {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <4 x i16>
%add = add <4 x i16> %1, <i16 3, i16 4, i16 5, i16 6>
%2 = bitcast <4 x i16> %add to double
; CHECK: # %bb.0:
; CHECK-NEXT: paddw %xmm1, %xmm0
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test7:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: paddw %xmm1, %xmm0
-; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <4 x i16>
%2 = bitcast double %B to <4 x i16>
%add = add <4 x i16> %1, %2
; CHECK: # %bb.0:
; CHECK-NEXT: paddb {{.*}}(%rip), %xmm0
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test8:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: paddb {{.*}}(%rip), %xmm0
-; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <8 x i8>
%add = add <8 x i8> %1, <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10>
%2 = bitcast <8 x i8> %add to double
; CHECK: # %bb.0:
; CHECK-NEXT: paddb %xmm1, %xmm0
; CHECK-NEXT: retq
-;
-; CHECK-WIDE-LABEL: test9:
-; CHECK-WIDE: # %bb.0:
-; CHECK-WIDE-NEXT: paddb %xmm1, %xmm0
-; CHECK-WIDE-NEXT: retq
%1 = bitcast double %A to <8 x i8>
%2 = bitcast double %B to <8 x i8>
%add = add <8 x i8> %1, %2
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_SKX
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_KNL
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PROMOTE --check-prefix=PROMOTE_SKX
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=CHECK --check-prefix=PROMOTE --check-prefix=PROMOTE_KNL
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -x86-experimental-vector-widening-legalization < %s | FileCheck %s --check-prefix=WIDEN_AVX2
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=PROMOTE_AVX2
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_SKX
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=CHECK --check-prefix=WIDEN --check-prefix=WIDEN_KNL
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=WIDEN_AVX2
define <2 x double> @test_gather_v2i32_index(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
; WIDEN_SKX-LABEL: test_gather_v2i32_index:
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
-; PROMOTE_SKX-LABEL: test_gather_v2i32_index:
-; PROMOTE_SKX: # %bb.0:
-; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %xmm2 {%k1}
-; PROMOTE_SKX-NEXT: vmovapd %xmm2, %xmm0
-; PROMOTE_SKX-NEXT: retq
-;
-; PROMOTE_KNL-LABEL: test_gather_v2i32_index:
-; PROMOTE_KNL: # %bb.0:
-; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1
-; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k1}
-; PROMOTE_KNL-NEXT: vmovapd %xmm2, %xmm0
-; PROMOTE_KNL-NEXT: vzeroupper
-; PROMOTE_KNL-NEXT: retq
-;
; WIDEN_AVX2-LABEL: test_gather_v2i32_index:
; WIDEN_AVX2: # %bb.0:
; WIDEN_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2
; WIDEN_AVX2-NEXT: vmovapd %xmm2, %xmm0
; WIDEN_AVX2-NEXT: retq
-;
-; PROMOTE_AVX2-LABEL: test_gather_v2i32_index:
-; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT: vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2
-; PROMOTE_AVX2-NEXT: vmovapd %xmm2, %xmm0
-; PROMOTE_AVX2-NEXT: retq
%gep.random = getelementptr double, double* %base, <2 x i32> %ind
%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
ret <2 x double> %res
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
-; PROMOTE_SKX-LABEL: test_scatter_v2i32_index:
-; PROMOTE_SKX: # %bb.0:
-; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT: vscatterdpd %xmm0, (%rdi,%xmm1,8) {%k1}
-; PROMOTE_SKX-NEXT: retq
-;
-; PROMOTE_KNL-LABEL: test_scatter_v2i32_index:
-; PROMOTE_KNL: # %bb.0:
-; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
-; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0
-; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vscatterdpd %zmm0, (%rdi,%ymm1,8) {%k1}
-; PROMOTE_KNL-NEXT: vzeroupper
-; PROMOTE_KNL-NEXT: retq
-;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_index:
; WIDEN_AVX2: # %bb.0:
; WIDEN_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vmovhps %xmm0, (%rax)
; WIDEN_AVX2-NEXT: retq
-;
-; PROMOTE_AVX2-LABEL: test_scatter_v2i32_index:
-; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT: vpsllq $3, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3
-; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
-; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
-; PROMOTE_AVX2-NEXT: testb $1, %al
-; PROMOTE_AVX2-NEXT: jne .LBB1_1
-; PROMOTE_AVX2-NEXT: # %bb.2: # %else
-; PROMOTE_AVX2-NEXT: testb $2, %al
-; PROMOTE_AVX2-NEXT: jne .LBB1_3
-; PROMOTE_AVX2-NEXT: .LBB1_4: # %else2
-; PROMOTE_AVX2-NEXT: retq
-; PROMOTE_AVX2-NEXT: .LBB1_1: # %cond.store
-; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
-; PROMOTE_AVX2-NEXT: vmovlps %xmm0, (%rcx)
-; PROMOTE_AVX2-NEXT: testb $2, %al
-; PROMOTE_AVX2-NEXT: je .LBB1_4
-; PROMOTE_AVX2-NEXT: .LBB1_3: # %cond.store1
-; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vmovhps %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: retq
%gep = getelementptr double, double *%base, <2 x i32> %ind
call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %a1, <2 x double*> %gep, i32 4, <2 x i1> %mask)
ret void
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
-; PROMOTE_SKX-LABEL: test_gather_v2i32_data:
-; PROMOTE_SKX: # %bb.0:
-; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT: vpgatherqd (,%xmm0), %xmm2 {%k1}
-; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0
-; PROMOTE_SKX-NEXT: retq
-;
-; PROMOTE_KNL-LABEL: test_gather_v2i32_data:
-; PROMOTE_KNL: # %bb.0:
-; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2
-; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1
-; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
-; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0
-; PROMOTE_KNL-NEXT: vzeroupper
-; PROMOTE_KNL-NEXT: retq
-;
; WIDEN_AVX2-LABEL: test_gather_v2i32_data:
; WIDEN_AVX2: # %bb.0:
; WIDEN_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; WIDEN_AVX2-NEXT: vpgatherqd %xmm1, (,%xmm0), %xmm2
; WIDEN_AVX2-NEXT: vmovdqa %xmm2, %xmm0
; WIDEN_AVX2-NEXT: retq
-;
-; PROMOTE_AVX2-LABEL: test_gather_v2i32_data:
-; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT: vpgatherqd %xmm1, (,%xmm0), %xmm2
-; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0
-; PROMOTE_AVX2-NEXT: retq
%res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0)
ret <2 x i32>%res
}
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
-; PROMOTE_SKX-LABEL: test_scatter_v2i32_data:
-; PROMOTE_SKX: # %bb.0:
-; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT: vpscatterqd %xmm0, (,%xmm1) {%k1}
-; PROMOTE_SKX-NEXT: retq
-;
-; PROMOTE_KNL-LABEL: test_scatter_v2i32_data:
-; PROMOTE_KNL: # %bb.0:
-; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0
-; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
-; PROMOTE_KNL-NEXT: vzeroupper
-; PROMOTE_KNL-NEXT: retq
-;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_data:
; WIDEN_AVX2: # %bb.0:
; WIDEN_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
; WIDEN_AVX2-NEXT: retq
-;
-; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data:
-; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
-; PROMOTE_AVX2-NEXT: testb $1, %al
-; PROMOTE_AVX2-NEXT: jne .LBB3_1
-; PROMOTE_AVX2-NEXT: # %bb.2: # %else
-; PROMOTE_AVX2-NEXT: testb $2, %al
-; PROMOTE_AVX2-NEXT: jne .LBB3_3
-; PROMOTE_AVX2-NEXT: .LBB3_4: # %else2
-; PROMOTE_AVX2-NEXT: retq
-; PROMOTE_AVX2-NEXT: .LBB3_1: # %cond.store
-; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
-; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx)
-; PROMOTE_AVX2-NEXT: testb $2, %al
-; PROMOTE_AVX2-NEXT: je .LBB3_4
-; PROMOTE_AVX2-NEXT: .LBB3_3: # %cond.store1
-; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: retq
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
ret void
}
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
-; PROMOTE_SKX-LABEL: test_gather_v2i32_data_index:
-; PROMOTE_SKX: # %bb.0:
-; PROMOTE_SKX-NEXT: vpsllq $63, %xmm1, %xmm1
-; PROMOTE_SKX-NEXT: vpmovq2m %xmm1, %k1
-; PROMOTE_SKX-NEXT: vpgatherdd (%rdi,%xmm0,4), %xmm2 {%k1}
-; PROMOTE_SKX-NEXT: vmovdqa %xmm2, %xmm0
-; PROMOTE_SKX-NEXT: retq
-;
-; PROMOTE_KNL-LABEL: test_gather_v2i32_data_index:
-; PROMOTE_KNL: # %bb.0:
-; PROMOTE_KNL-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2
-; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT: vpsllq $63, %xmm1, %xmm1
-; PROMOTE_KNL-NEXT: vptestmq %zmm1, %zmm1, %k0
-; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
-; PROMOTE_KNL-NEXT: vmovdqa %xmm2, %xmm0
-; PROMOTE_KNL-NEXT: vzeroupper
-; PROMOTE_KNL-NEXT: retq
-;
; WIDEN_AVX2-LABEL: test_gather_v2i32_data_index:
; WIDEN_AVX2: # %bb.0:
; WIDEN_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; WIDEN_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
; WIDEN_AVX2-NEXT: vmovdqa %xmm2, %xmm0
; WIDEN_AVX2-NEXT: retq
-;
-; PROMOTE_AVX2-LABEL: test_gather_v2i32_data_index:
-; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; PROMOTE_AVX2-NEXT: vpslld $31, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
-; PROMOTE_AVX2-NEXT: vmovdqa %xmm2, %xmm0
-; PROMOTE_AVX2-NEXT: retq
%gep.random = getelementptr i32, i32* %base, <2 x i32> %ind
%res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
ret <2 x i32> %res
; WIDEN_KNL-NEXT: vzeroupper
; WIDEN_KNL-NEXT: retq
;
-; PROMOTE_SKX-LABEL: test_scatter_v2i32_data_index:
-; PROMOTE_SKX: # %bb.0:
-; PROMOTE_SKX-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_SKX-NEXT: vpmovq2m %xmm2, %k1
-; PROMOTE_SKX-NEXT: vpscatterdd %xmm0, (%rdi,%xmm1,4) {%k1}
-; PROMOTE_SKX-NEXT: retq
-;
-; PROMOTE_KNL-LABEL: test_scatter_v2i32_data_index:
-; PROMOTE_KNL: # %bb.0:
-; PROMOTE_KNL-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; PROMOTE_KNL-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; PROMOTE_KNL-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_KNL-NEXT: vptestmq %zmm2, %zmm2, %k0
-; PROMOTE_KNL-NEXT: kshiftlw $14, %k0, %k0
-; PROMOTE_KNL-NEXT: kshiftrw $14, %k0, %k1
-; PROMOTE_KNL-NEXT: vpscatterdd %zmm0, (%rdi,%zmm1,4) {%k1}
-; PROMOTE_KNL-NEXT: vzeroupper
-; PROMOTE_KNL-NEXT: retq
-;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_data_index:
; WIDEN_AVX2: # %bb.0:
; WIDEN_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
; WIDEN_AVX2-NEXT: retq
-;
-; PROMOTE_AVX2-LABEL: test_scatter_v2i32_data_index:
-; PROMOTE_AVX2: # %bb.0:
-; PROMOTE_AVX2-NEXT: vpmovsxdq %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT: vpsllq $2, %xmm1, %xmm1
-; PROMOTE_AVX2-NEXT: vmovq %rdi, %xmm3
-; PROMOTE_AVX2-NEXT: vpbroadcastq %xmm3, %xmm3
-; PROMOTE_AVX2-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; PROMOTE_AVX2-NEXT: vpsllq $63, %xmm2, %xmm2
-; PROMOTE_AVX2-NEXT: vmovmskpd %xmm2, %eax
-; PROMOTE_AVX2-NEXT: testb $1, %al
-; PROMOTE_AVX2-NEXT: jne .LBB5_1
-; PROMOTE_AVX2-NEXT: # %bb.2: # %else
-; PROMOTE_AVX2-NEXT: testb $2, %al
-; PROMOTE_AVX2-NEXT: jne .LBB5_3
-; PROMOTE_AVX2-NEXT: .LBB5_4: # %else2
-; PROMOTE_AVX2-NEXT: retq
-; PROMOTE_AVX2-NEXT: .LBB5_1: # %cond.store
-; PROMOTE_AVX2-NEXT: vmovq %xmm1, %rcx
-; PROMOTE_AVX2-NEXT: vmovss %xmm0, (%rcx)
-; PROMOTE_AVX2-NEXT: testb $2, %al
-; PROMOTE_AVX2-NEXT: je .LBB5_4
-; PROMOTE_AVX2-NEXT: .LBB5_3: # %cond.store1
-; PROMOTE_AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; PROMOTE_AVX2-NEXT: vextractps $1, %xmm0, (%rax)
-; PROMOTE_AVX2-NEXT: retq
%gep = getelementptr i32, i32 *%base, <2 x i32> %ind
call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %gep, i32 4, <2 x i1> %mask)
ret void
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-PROMOTE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE2 --check-prefix=SSE2-WIDEN
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-PROMOTE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=SSE --check-prefix=SSE41 --check-prefix=SSE41-WIDEN
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2
-
-@c = external global i32*, align 8
-
-; %val1 = load <2 x i8>
-; %op1 = zext<2 x i32> %val1
-; %val2 = load <2 x i8>
-; %op2 = zext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi8:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
-; X86-SSE-NEXT: movd %edx, %xmm0
-; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT: movd %eax, %xmm1
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm1
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
- %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
- %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <4 x i8>
-; %op1 = zext<4 x i32> %val1
-; %val2 = load <4 x i8>
-; %op2 = zext<4 x i32> %val2
-; %rst = mul <4 x i32> %op1, %op2
-;
-define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_4xi8:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_4xi8:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_4xi8:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_4xi8:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
- %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
- %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
- %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
- %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
- %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
- store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <8 x i8>
-; %op1 = zext<8 x i32> %val1
-; %val2 = load <8 x i8>
-; %op2 = zext<8 x i32> %val2
-; %rst = mul <8 x i32> %op1, %op2
-;
-define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_8xi8:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: mul_8xi8:
-; X86-AVX1: # %bb.0: # %entry
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: movl c, %esi
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
-; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX2-LABEL: mul_8xi8:
-; X86-AVX2: # %bb.0: # %entry
-; X86-AVX2-NEXT: pushl %esi
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT: movl c, %esi
-; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
-; X86-AVX2-NEXT: popl %esi
-; X86-AVX2-NEXT: vzeroupper
-; X86-AVX2-NEXT: retl
-;
-; X64-SSE-LABEL: mul_8xi8:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: mul_8xi8:
-; X64-AVX1: # %bb.0: # %entry
-; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_8xi8:
-; X64-AVX2: # %bb.0: # %entry
-; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
- %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
- %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
- %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
- %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
- %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
- store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <16 x i8>
-; %op1 = zext<16 x i32> %val1
-; %val2 = load <16 x i8>
-; %op2 = zext<16 x i32> %val2
-; %rst = mul <16 x i32> %op1, %op2
-;
-define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_16xi8:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: movdqa %xmm0, %xmm3
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X86-SSE-NEXT: movdqa %xmm1, %xmm4
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-SSE-NEXT: pmullw %xmm3, %xmm4
-; X86-SSE-NEXT: movdqa %xmm4, %xmm3
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X86-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm0, 32(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm4, 16(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm3, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: mul_16xi8:
-; X86-AVX1: # %bb.0: # %entry
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: movl c, %esi
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X86-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX2-LABEL: mul_16xi8:
-; X86-AVX2: # %bb.0: # %entry
-; X86-AVX2-NEXT: pushl %esi
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT: movl c, %esi
-; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X86-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT: popl %esi
-; X86-AVX2-NEXT: vzeroupper
-; X86-AVX2-NEXT: retl
-;
-; X64-SSE-LABEL: mul_16xi8:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: movdqa %xmm0, %xmm3
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-SSE-NEXT: movdqa %xmm1, %xmm4
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-SSE-NEXT: pmullw %xmm3, %xmm4
-; X64-SSE-NEXT: movdqa %xmm4, %xmm3
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
-; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; X64-SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm0, 32(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm4, 16(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm3, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: mul_16xi8:
-; X64-AVX1: # %bb.0: # %entry
-; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmaddwd %xmm0, %xmm4, %xmm0
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm4, %xmm1
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; X64-AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_16xi8:
-; X64-AVX2: # %bb.0: # %entry
-; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT: vpmaddwd %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; X64-AVX2-NEXT: vpmaddwd %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
-; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
- %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
- %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
- %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
- %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
- %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
- store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <2 x i16>
-; %op1 = zext<2 x i32> %val1
-; %val2 = load <2 x i16>
-; %op2 = zext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi16:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi16:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi16:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi16:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
- %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
- %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
- %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
- %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <4 x i16>
-; %op1 = zext<4 x i32> %val1
-; %val2 = load <4 x i16>
-; %op2 = zext<4 x i32> %val2
-; %rst = mul <4 x i32> %op1, %op2
-;
-define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_4xi16:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movdqu %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_4xi16:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovdqu %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_4xi16:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64-SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movdqu %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_4xi16:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovdqu %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
- %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
- %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
- %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
- %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
- %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
- store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <8 x i16>
-; %op1 = zext<8 x i32> %val1
-; %val2 = load <8 x i16>
-; %op2 = zext<8 x i32> %val2
-; %rst = mul <8 x i32> %op1, %op2
-;
-define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_8xi16:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm1
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pmulhuw %xmm0, %xmm2
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: movdqu %xmm1, 16(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: mul_8xi16:
-; X86-AVX1: # %bb.0: # %entry
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: movl c, %esi
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
-; X86-AVX1-NEXT: vmovdqu %xmm0, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm1, (%esi,%ecx,4)
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX2-LABEL: mul_8xi16:
-; X86-AVX2: # %bb.0: # %entry
-; X86-AVX2-NEXT: pushl %esi
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT: movl c, %esi
-; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4)
-; X86-AVX2-NEXT: popl %esi
-; X86-AVX2-NEXT: vzeroupper
-; X86-AVX2-NEXT: retl
-;
-; X64-SSE-LABEL: mul_8xi16:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1
-; X64-SSE-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: movdqa %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: mul_8xi16:
-; X64-AVX1: # %bb.0: # %entry
-; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT: vmovdqu %xmm0, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm1, (%rax,%rdx,4)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_8xi16:
-; X64-AVX2: # %bb.0: # %entry
-; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4)
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
- %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
- %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
- %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
- %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
- %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
- store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <16 x i16>
-; %op1 = zext<16 x i32> %val1
-; %val2 = load <16 x i16>
-; %op2 = zext<16 x i32> %val2
-; %rst = mul <16 x i32> %op1, %op2
-;
-define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_16xi16:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
-; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
-; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
-; X86-SSE-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4
-; X86-SSE-NEXT: pmullw %xmm0, %xmm2
-; X86-SSE-NEXT: movdqa %xmm2, %xmm0
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X86-SSE-NEXT: movdqa %xmm3, %xmm4
-; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4
-; X86-SSE-NEXT: pmullw %xmm1, %xmm3
-; X86-SSE-NEXT: movdqa %xmm3, %xmm1
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: mul_16xi16:
-; X86-AVX1: # %bb.0: # %entry
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: movl c, %esi
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX2-LABEL: mul_16xi16:
-; X86-AVX2: # %bb.0: # %entry
-; X86-AVX2-NEXT: pushl %esi
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT: movl c, %esi
-; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT: popl %esi
-; X86-AVX2-NEXT: vzeroupper
-; X86-AVX2-NEXT: retl
-;
-; X64-SSE-LABEL: mul_16xi16:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
-; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
-; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
-; X64-SSE-NEXT: movdqa %xmm2, %xmm4
-; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4
-; X64-SSE-NEXT: pmullw %xmm0, %xmm2
-; X64-SSE-NEXT: movdqa %xmm2, %xmm0
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X64-SSE-NEXT: movdqa %xmm3, %xmm4
-; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4
-; X64-SSE-NEXT: pmullw %xmm1, %xmm3
-; X64-SSE-NEXT: movdqa %xmm3, %xmm1
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: mul_16xi16:
-; X64-AVX1: # %bb.0: # %entry
-; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_16xi16:
-; X64-AVX2: # %bb.0: # %entry
-; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
-; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
- %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
- %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
- %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
- %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
- %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
- store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <2 x i8>
-; %op1 = sext<2 x i32> %val1
-; %val2 = load <2 x i8>
-; %op2 = sext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi8_sext:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
-; X86-SSE-NEXT: movd %edx, %xmm0
-; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT: movd %eax, %xmm1
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: psraw $8, %xmm1
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8_sext:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8_sext:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm1
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: psraw $8, %xmm0
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: psraw $8, %xmm1
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8_sext:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovsxbd %xmm1, %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
- %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
- %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <2 x i8>
-; %op1 = sext<2 x i32> %val1
-; %val2 = load <2 x i8>
-; %op2 = zext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi8_sext_zext:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx
-; X86-SSE-NEXT: movd %edx, %xmm0
-; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax
-; X86-SSE-NEXT: movd %eax, %xmm1
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8_sext_zext:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: movzwl (%edx,%ecx), %edx
-; X86-AVX-NEXT: vmovd %edx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: movzwl (%eax,%ecx), %eax
-; X86-AVX-NEXT: vmovd %eax, %xmm1
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8_sext_zext:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm1
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: psraw $8, %xmm0
-; X64-SSE-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8_sext_zext:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: movzwl (%rsi,%rdx), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm1
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
- %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
- %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <2 x i16>
-; %op1 = sext<2 x i32> %val1
-; %val2 = load <2 x i16>
-; %op2 = sext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi16_sext:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movdqa %xmm1, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm0, %xmm2
-; X86-SSE-NEXT: pmullw %xmm0, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi16_sext:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi16_sext:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movdqa %xmm1, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm0, %xmm2
-; X64-SSE-NEXT: pmullw %xmm0, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi16_sext:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm1, %xmm1
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
- %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
- %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
- %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
- %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <2 x i16>
-; %op1 = sext<2 x i32> %val1
-; %val2 = load <2 x i16>
-; %op2 = zext<2 x i32> %val2
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_2xi16_sext_zext:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pxor %xmm2, %xmm2
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi16_sext_zext:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: movl c, %esi
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4)
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi16_sext_zext:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pxor %xmm2, %xmm2
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm0, %xmm1
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm0
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi16_sext_zext:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
- %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
- %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
- %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
- %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val1 = load <16 x i16>
-; %op1 = sext<16 x i32> %val1
-; %val2 = load <16 x i16>
-; %op2 = sext<16 x i32> %val2
-; %rst = mul <16 x i32> %op1, %op2
-;
-define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) nounwind {
-; X86-SSE-LABEL: mul_16xi16_sext:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-SSE-NEXT: movl c, %esi
-; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0
-; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1
-; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2
-; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3
-; X86-SSE-NEXT: movdqa %xmm2, %xmm4
-; X86-SSE-NEXT: pmulhw %xmm0, %xmm4
-; X86-SSE-NEXT: pmullw %xmm0, %xmm2
-; X86-SSE-NEXT: movdqa %xmm2, %xmm0
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X86-SSE-NEXT: movdqa %xmm3, %xmm4
-; X86-SSE-NEXT: pmulhw %xmm1, %xmm4
-; X86-SSE-NEXT: pmullw %xmm1, %xmm3
-; X86-SSE-NEXT: movdqa %xmm3, %xmm1
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4)
-; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: mul_16xi16_sext:
-; X86-AVX1: # %bb.0: # %entry
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX1-NEXT: movl c, %esi
-; X86-AVX1-NEXT: vpmovsxwd 24(%edx,%ecx), %xmm0
-; X86-AVX1-NEXT: vpmovsxwd 16(%edx,%ecx), %xmm1
-; X86-AVX1-NEXT: vpmovsxwd 8(%edx,%ecx), %xmm2
-; X86-AVX1-NEXT: vpmovsxwd (%edx,%ecx), %xmm3
-; X86-AVX1-NEXT: vpmovsxwd 24(%eax,%ecx), %xmm4
-; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
-; X86-AVX1-NEXT: vpmovsxwd 16(%eax,%ecx), %xmm4
-; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
-; X86-AVX1-NEXT: vpmovsxwd 8(%eax,%ecx), %xmm4
-; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
-; X86-AVX1-NEXT: vpmovsxwd (%eax,%ecx), %xmm4
-; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
-; X86-AVX1-NEXT: vmovdqu %xmm0, 48(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm1, 32(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm2, 16(%esi,%ecx,4)
-; X86-AVX1-NEXT: vmovdqu %xmm3, (%esi,%ecx,4)
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX2-LABEL: mul_16xi16_sext:
-; X86-AVX2: # %bb.0: # %entry
-; X86-AVX2-NEXT: pushl %esi
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX2-NEXT: movl c, %esi
-; X86-AVX2-NEXT: vpmovsxwd 16(%edx,%ecx), %ymm0
-; X86-AVX2-NEXT: vpmovsxwd (%edx,%ecx), %ymm1
-; X86-AVX2-NEXT: vpmovsxwd 16(%eax,%ecx), %ymm2
-; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
-; X86-AVX2-NEXT: vpmovsxwd (%eax,%ecx), %ymm2
-; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4)
-; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4)
-; X86-AVX2-NEXT: popl %esi
-; X86-AVX2-NEXT: vzeroupper
-; X86-AVX2-NEXT: retl
-;
-; X64-SSE-LABEL: mul_16xi16_sext:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0
-; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1
-; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2
-; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3
-; X64-SSE-NEXT: movdqa %xmm2, %xmm4
-; X64-SSE-NEXT: pmulhw %xmm0, %xmm4
-; X64-SSE-NEXT: pmullw %xmm0, %xmm2
-; X64-SSE-NEXT: movdqa %xmm2, %xmm0
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X64-SSE-NEXT: movdqa %xmm3, %xmm4
-; X64-SSE-NEXT: pmulhw %xmm1, %xmm4
-; X64-SSE-NEXT: pmullw %xmm1, %xmm3
-; X64-SSE-NEXT: movdqa %xmm3, %xmm1
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4)
-; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: mul_16xi16_sext:
-; X64-AVX1: # %bb.0: # %entry
-; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX1-NEXT: vpmovsxwd 24(%rdi,%rdx), %xmm0
-; X64-AVX1-NEXT: vpmovsxwd 16(%rdi,%rdx), %xmm1
-; X64-AVX1-NEXT: vpmovsxwd 8(%rdi,%rdx), %xmm2
-; X64-AVX1-NEXT: vpmovsxwd (%rdi,%rdx), %xmm3
-; X64-AVX1-NEXT: vpmovsxwd 24(%rsi,%rdx), %xmm4
-; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0
-; X64-AVX1-NEXT: vpmovsxwd 16(%rsi,%rdx), %xmm4
-; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1
-; X64-AVX1-NEXT: vpmovsxwd 8(%rsi,%rdx), %xmm4
-; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
-; X64-AVX1-NEXT: vpmovsxwd (%rsi,%rdx), %xmm4
-; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3
-; X64-AVX1-NEXT: vmovdqu %xmm0, 48(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm1, 32(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm2, 16(%rax,%rdx,4)
-; X64-AVX1-NEXT: vmovdqu %xmm3, (%rax,%rdx,4)
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: mul_16xi16_sext:
-; X64-AVX2: # %bb.0: # %entry
-; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX2-NEXT: vpmovsxwd 16(%rdi,%rdx), %ymm0
-; X64-AVX2-NEXT: vpmovsxwd (%rdi,%rdx), %ymm1
-; X64-AVX2-NEXT: vpmovsxwd 16(%rsi,%rdx), %ymm2
-; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0
-; X64-AVX2-NEXT: vpmovsxwd (%rsi,%rdx), %ymm2
-; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4)
-; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4)
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
- %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
- %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
- %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
- %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
- %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
- %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
- %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
- store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst1:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT: movd %ecx, %xmm0
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst1:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst1:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst1:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst2:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT: movd %ecx, %xmm0
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst2:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst2:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: psraw $8, %xmm0
-; X64-SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst2:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst3:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT: movd %ecx, %xmm0
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm0
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst3:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst3:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm0
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst3:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst4:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT: movd %ecx, %xmm0
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst4:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst4:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst4:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst5:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT: movd %ecx, %xmm0
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst5:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst5:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: psraw $8, %xmm0
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst5:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i8>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi8_varconst6:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-SSE-NEXT: movd %ecx, %xmm0
-; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X86-SSE-NEXT: psraw $8, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi8_varconst6:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: movzwl (%ecx,%eax), %ecx
-; X86-AVX-NEXT: vmovd %ecx, %xmm0
-; X86-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi8_varconst6:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-SSE-NEXT: movd %ecx, %xmm0
-; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X64-SSE-NEXT: psraw $8, %xmm0
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi8_varconst6:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: movzwl (%rdi,%rsi), %ecx
-; X64-AVX-NEXT: vmovd %ecx, %xmm0
-; X64-AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
- %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
- %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i16>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi16_varconst1:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi16_varconst1:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi16_varconst1:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi16_varconst1:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
- %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
- %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i16>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi16_varconst2:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
-; X86-SSE-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X86-SSE-NEXT: pmullw %xmm1, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi16_varconst2:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi16_varconst2:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
-; X64-SSE-NEXT: movdqa %xmm0, %xmm2
-; X64-SSE-NEXT: pmulhw %xmm1, %xmm2
-; X64-SSE-NEXT: pmullw %xmm1, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi16_varconst2:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
- %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
- %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i16>
-; %op1 = zext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi16_varconst3:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pxor %xmm1, %xmm1
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi16_varconst3:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi16_varconst3:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pxor %xmm1, %xmm1
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65536,u,u>
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi16_varconst3:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
- %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
- %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-; %val = load <2 x i16>
-; %op1 = sext<2 x i32> %val
-; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
-; %rst = mul <2 x i32> %op1, %op2
-;
-define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
-; X86-SSE-LABEL: mul_2xi16_varconst4:
-; X86-SSE: # %bb.0: # %entry
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl c, %edx
-; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X86-SSE-NEXT: psrad $16, %xmm0
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X86-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4)
-; X86-SSE-NEXT: retl
-;
-; X86-AVX-LABEL: mul_2xi16_varconst4:
-; X86-AVX: # %bb.0: # %entry
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl c, %edx
-; X86-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
-; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4)
-; X86-AVX-NEXT: retl
-;
-; X64-SSE-LABEL: mul_2xi16_varconst4:
-; X64-SSE: # %bb.0: # %entry
-; X64-SSE-NEXT: movq {{.*}}(%rip), %rax
-; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X64-SSE-NEXT: psrad $16, %xmm0
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,32768,u,u>
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X64-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX-LABEL: mul_2xi16_varconst4:
-; X64-AVX: # %bb.0: # %entry
-; X64-AVX-NEXT: movq {{.*}}(%rip), %rax
-; X64-AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X64-AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; X64-AVX-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4)
-; X64-AVX-NEXT: retq
-entry:
- %pre = load i32*, i32** @c
- %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
- %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
- %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
- %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
- %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
- %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
- %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
- store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
- ret void
-}
-
-;
-; Illegal Types
-;
-
-define void @PR34947(<9 x i16>* %p0, <9 x i32>* %p1) nounwind {
-; X86-SSE-LABEL: PR34947:
-; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: pushl %esi
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT: movdqa (%eax), %xmm5
-; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movdqa (%ecx), %xmm2
-; X86-SSE-NEXT: movdqa 16(%ecx), %xmm6
-; X86-SSE-NEXT: pxor %xmm0, %xmm0
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X86-SSE-NEXT: movdqa %xmm5, %xmm4
-; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
-; X86-SSE-NEXT: movd %xmm0, %eax
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
-; X86-SSE-NEXT: movd %xmm0, %esi
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl %esi
-; X86-SSE-NEXT: movd %edx, %xmm0
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
-; X86-SSE-NEXT: movd %xmm3, %eax
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[2,3,0,1]
-; X86-SSE-NEXT: movd %xmm3, %esi
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl %esi
-; X86-SSE-NEXT: movd %edx, %xmm7
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
-; X86-SSE-NEXT: movd %xmm5, %eax
-; X86-SSE-NEXT: movd %xmm6, %esi
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl %esi
-; X86-SSE-NEXT: movd %edx, %xmm3
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm5, %eax
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm5, %esi
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl %esi
-; X86-SSE-NEXT: movd %edx, %xmm5
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
-; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm7[0]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[3,1,2,3]
-; X86-SSE-NEXT: movd %xmm6, %eax
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
-; X86-SSE-NEXT: movd %xmm6, %esi
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl %esi
-; X86-SSE-NEXT: movd %edx, %xmm6
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1]
-; X86-SSE-NEXT: movd %xmm7, %eax
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
-; X86-SSE-NEXT: movd %xmm7, %esi
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl %esi
-; X86-SSE-NEXT: movd %edx, %xmm7
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; X86-SSE-NEXT: movd %xmm4, %eax
-; X86-SSE-NEXT: movd %xmm2, %esi
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl %esi
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm4, %eax
-; X86-SSE-NEXT: movd %edx, %xmm4
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; X86-SSE-NEXT: movd %xmm2, %esi
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl %esi
-; X86-SSE-NEXT: movd %edx, %xmm2
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
-; X86-SSE-NEXT: movd %xmm1, %eax
-; X86-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
-; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm4
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X86-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm0[0,0]
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm3
-; X86-SSE-NEXT: pmuludq %xmm1, %xmm5
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
-; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT: xorl %edx, %edx
-; X86-SSE-NEXT: divl 32(%ecx)
-; X86-SSE-NEXT: movdqa %xmm0, (%eax)
-; X86-SSE-NEXT: movdqa %xmm4, (%eax)
-; X86-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
-; X86-SSE-NEXT: movl %eax, (%eax)
-; X86-SSE-NEXT: popl %esi
-; X86-SSE-NEXT: retl
-;
-; X86-AVX1-LABEL: PR34947:
-; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: pushl %ebp
-; X86-AVX1-NEXT: pushl %ebx
-; X86-AVX1-NEXT: pushl %edi
-; X86-AVX1-NEXT: pushl %esi
-; X86-AVX1-NEXT: subl $16, %esp
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X86-AVX1-NEXT: vmovd %xmm1, %eax
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl 32(%ecx)
-; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $3, %xmm2, %eax
-; X86-AVX1-NEXT: vmovdqa (%ecx), %xmm1
-; X86-AVX1-NEXT: vmovdqa 16(%ecx), %xmm3
-; X86-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $2, %xmm2, %eax
-; X86-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-AVX1-NEXT: vpextrd $1, %xmm2, %eax
-; X86-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-AVX1-NEXT: vmovd %xmm2, %eax
-; X86-AVX1-NEXT: vmovd %xmm3, %ecx
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, %ebp
-; X86-AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; X86-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: movl %edx, %ebx
-; X86-AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; X86-AVX1-NEXT: vpextrd $2, %xmm1, %esi
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %esi
-; X86-AVX1-NEXT: movl %edx, %esi
-; X86-AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; X86-AVX1-NEXT: vpextrd $1, %xmm1, %edi
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %edi
-; X86-AVX1-NEXT: movl %edx, %edi
-; X86-AVX1-NEXT: vmovd %xmm0, %eax
-; X86-AVX1-NEXT: vmovd %xmm1, %ecx
-; X86-AVX1-NEXT: xorl %edx, %edx
-; X86-AVX1-NEXT: divl %ecx
-; X86-AVX1-NEXT: vmovd %edx, %xmm0
-; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpinsrd $3, %ebx, %xmm0, %xmm0
-; X86-AVX1-NEXT: vmovd %ebp, %xmm1
-; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT: vpinsrd $2, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT: vpinsrd $3, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 4-byte Folded Reload
-; X86-AVX1-NEXT: imull $8199, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-AVX1-NEXT: # imm = 0x2007
-; X86-AVX1-NEXT: movl %eax, (%eax)
-; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [8199,8199,8199,8199]
-; X86-AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
-; X86-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; X86-AVX1-NEXT: vmovdqa %xmm1, (%eax)
-; X86-AVX1-NEXT: vmovdqa %xmm0, (%eax)
-; X86-AVX1-NEXT: addl $16, %esp
-; X86-AVX1-NEXT: popl %esi
-; X86-AVX1-NEXT: popl %edi
-; X86-AVX1-NEXT: popl %ebx
-; X86-AVX1-NEXT: popl %ebp
-; X86-AVX1-NEXT: retl
-;
-; X86-AVX2-LABEL: PR34947:
-; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: pushl %edi
-; X86-AVX2-NEXT: pushl %esi
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X86-AVX2-NEXT: vmovdqa (%esi), %xmm2
-; X86-AVX2-NEXT: vmovdqa 16(%esi), %xmm3
-; X86-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
-; X86-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; X86-AVX2-NEXT: vpextrd $1, %xmm4, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl %ecx
-; X86-AVX2-NEXT: movl %edx, %ecx
-; X86-AVX2-NEXT: vmovd %xmm3, %edi
-; X86-AVX2-NEXT: vmovd %xmm4, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl %edi
-; X86-AVX2-NEXT: vmovd %edx, %xmm5
-; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
-; X86-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
-; X86-AVX2-NEXT: vpextrd $2, %xmm4, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl %ecx
-; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; X86-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
-; X86-AVX2-NEXT: vpextrd $3, %xmm4, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl %ecx
-; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
-; X86-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
-; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl %ecx
-; X86-AVX2-NEXT: movl %edx, %ecx
-; X86-AVX2-NEXT: vmovd %xmm2, %edi
-; X86-AVX2-NEXT: vmovd %xmm1, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl %edi
-; X86-AVX2-NEXT: vmovd %edx, %xmm4
-; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
-; X86-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
-; X86-AVX2-NEXT: vpextrd $2, %xmm1, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl %ecx
-; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; X86-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
-; X86-AVX2-NEXT: vpextrd $3, %xmm1, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl %ecx
-; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
-; X86-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; X86-AVX2-NEXT: vmovd %xmm0, %eax
-; X86-AVX2-NEXT: xorl %edx, %edx
-; X86-AVX2-NEXT: divl 32(%esi)
-; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
-; X86-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
-; X86-AVX2-NEXT: movl %eax, (%eax)
-; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax)
-; X86-AVX2-NEXT: popl %esi
-; X86-AVX2-NEXT: popl %edi
-; X86-AVX2-NEXT: vzeroupper
-; X86-AVX2-NEXT: retl
-;
-; X64-SSE-LABEL: PR34947:
-; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movdqa (%rdi), %xmm5
-; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-SSE-NEXT: movdqa (%rsi), %xmm2
-; X64-SSE-NEXT: movdqa 16(%rsi), %xmm6
-; X64-SSE-NEXT: pxor %xmm0, %xmm0
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X64-SSE-NEXT: movdqa %xmm5, %xmm3
-; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
-; X64-SSE-NEXT: movd %xmm0, %eax
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,1,2,3]
-; X64-SSE-NEXT: movd %xmm0, %ecx
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
-; X64-SSE-NEXT: movd %edx, %xmm8
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1]
-; X64-SSE-NEXT: movd %xmm4, %eax
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[2,3,0,1]
-; X64-SSE-NEXT: movd %xmm4, %ecx
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
-; X64-SSE-NEXT: movd %edx, %xmm7
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
-; X64-SSE-NEXT: movd %xmm5, %eax
-; X64-SSE-NEXT: movd %xmm6, %ecx
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
-; X64-SSE-NEXT: movd %edx, %xmm4
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3]
-; X64-SSE-NEXT: movd %xmm5, %eax
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,2,3]
-; X64-SSE-NEXT: movd %xmm5, %ecx
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
-; X64-SSE-NEXT: movd %edx, %xmm5
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm7[0]
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[3,1,2,3]
-; X64-SSE-NEXT: movd %xmm6, %eax
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,3]
-; X64-SSE-NEXT: movd %xmm6, %ecx
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
-; X64-SSE-NEXT: movd %edx, %xmm6
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[2,3,0,1]
-; X64-SSE-NEXT: movd %xmm7, %eax
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1]
-; X64-SSE-NEXT: movd %xmm7, %ecx
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
-; X64-SSE-NEXT: movd %edx, %xmm7
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
-; X64-SSE-NEXT: movd %xmm3, %eax
-; X64-SSE-NEXT: movd %xmm2, %ecx
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
-; X64-SSE-NEXT: movd %edx, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; X64-SSE-NEXT: movd %xmm3, %eax
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; X64-SSE-NEXT: movd %xmm2, %ecx
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl %ecx
-; X64-SSE-NEXT: movd %edx, %xmm2
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
-; X64-SSE-NEXT: movd %xmm1, %eax
-; X64-SSE-NEXT: xorl %edx, %edx
-; X64-SSE-NEXT: divl 32(%rsi)
-; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm6[0,0]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm2
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm4
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
-; X64-SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[0,0]
-; X64-SSE-NEXT: pmuludq %xmm1, %xmm5
-; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
-; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64-SSE-NEXT: imull $8199, %edx, %eax # imm = 0x2007
-; X64-SSE-NEXT: movl %eax, (%rax)
-; X64-SSE-NEXT: movdqa %xmm2, (%rax)
-; X64-SSE-NEXT: movdqa %xmm0, (%rax)
-; X64-SSE-NEXT: retq
-;
-; X64-AVX1-LABEL: PR34947:
-; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: pushq %rbp
-; X64-AVX1-NEXT: pushq %rbx
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; X64-AVX1-NEXT: vmovd %xmm1, %eax
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl 32(%rsi)
-; X64-AVX1-NEXT: movl %edx, %r8d
-; X64-AVX1-NEXT: vpextrd $3, %xmm2, %eax
-; X64-AVX1-NEXT: vmovdqa (%rsi), %xmm1
-; X64-AVX1-NEXT: vmovdqa 16(%rsi), %xmm3
-; X64-AVX1-NEXT: vpextrd $3, %xmm3, %ecx
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %r9d
-; X64-AVX1-NEXT: vpextrd $2, %xmm2, %eax
-; X64-AVX1-NEXT: vpextrd $2, %xmm3, %ecx
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %r10d
-; X64-AVX1-NEXT: vpextrd $1, %xmm2, %eax
-; X64-AVX1-NEXT: vpextrd $1, %xmm3, %ecx
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %r11d
-; X64-AVX1-NEXT: vmovd %xmm2, %eax
-; X64-AVX1-NEXT: vmovd %xmm3, %ecx
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %esi
-; X64-AVX1-NEXT: vpextrd $3, %xmm0, %eax
-; X64-AVX1-NEXT: vpextrd $3, %xmm1, %ecx
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %edi
-; X64-AVX1-NEXT: vpextrd $2, %xmm0, %eax
-; X64-AVX1-NEXT: vpextrd $2, %xmm1, %ecx
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ecx
-; X64-AVX1-NEXT: movl %edx, %ecx
-; X64-AVX1-NEXT: vpextrd $1, %xmm0, %eax
-; X64-AVX1-NEXT: vpextrd $1, %xmm1, %ebx
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ebx
-; X64-AVX1-NEXT: movl %edx, %ebx
-; X64-AVX1-NEXT: vmovd %xmm0, %eax
-; X64-AVX1-NEXT: vmovd %xmm1, %ebp
-; X64-AVX1-NEXT: xorl %edx, %edx
-; X64-AVX1-NEXT: divl %ebp
-; X64-AVX1-NEXT: vmovd %edx, %xmm0
-; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
-; X64-AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; X64-AVX1-NEXT: vmovd %esi, %xmm2
-; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2
-; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
-; X64-AVX1-NEXT: imull $8199, %r8d, %eax # imm = 0x2007
-; X64-AVX1-NEXT: movl %eax, (%rax)
-; X64-AVX1-NEXT: vmovdqa %xmm1, (%rax)
-; X64-AVX1-NEXT: vmovdqa %xmm0, (%rax)
-; X64-AVX1-NEXT: popq %rbx
-; X64-AVX1-NEXT: popq %rbp
-; X64-AVX1-NEXT: retq
-;
-; X64-AVX2-LABEL: PR34947:
-; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; X64-AVX2-NEXT: vmovdqa (%rsi), %xmm2
-; X64-AVX2-NEXT: vmovdqa 16(%rsi), %xmm3
-; X64-AVX2-NEXT: vpextrd $1, %xmm3, %ecx
-; X64-AVX2-NEXT: vextracti128 $1, %ymm1, %xmm4
-; X64-AVX2-NEXT: vpextrd $1, %xmm4, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %ecx
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: vmovd %xmm3, %edi
-; X64-AVX2-NEXT: vmovd %xmm4, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %edi
-; X64-AVX2-NEXT: vmovd %edx, %xmm5
-; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm5, %xmm5
-; X64-AVX2-NEXT: vpextrd $2, %xmm3, %ecx
-; X64-AVX2-NEXT: vpextrd $2, %xmm4, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %ecx
-; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm5, %xmm5
-; X64-AVX2-NEXT: vpextrd $3, %xmm3, %ecx
-; X64-AVX2-NEXT: vpextrd $3, %xmm4, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %ecx
-; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm5, %xmm3
-; X64-AVX2-NEXT: vpextrd $1, %xmm2, %ecx
-; X64-AVX2-NEXT: vpextrd $1, %xmm1, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %ecx
-; X64-AVX2-NEXT: movl %edx, %ecx
-; X64-AVX2-NEXT: vmovd %xmm2, %edi
-; X64-AVX2-NEXT: vmovd %xmm1, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %edi
-; X64-AVX2-NEXT: vmovd %edx, %xmm4
-; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm4, %xmm4
-; X64-AVX2-NEXT: vpextrd $2, %xmm2, %ecx
-; X64-AVX2-NEXT: vpextrd $2, %xmm1, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %ecx
-; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm4, %xmm4
-; X64-AVX2-NEXT: vpextrd $3, %xmm2, %ecx
-; X64-AVX2-NEXT: vpextrd $3, %xmm1, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl %ecx
-; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm4, %xmm1
-; X64-AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; X64-AVX2-NEXT: vmovd %xmm0, %eax
-; X64-AVX2-NEXT: xorl %edx, %edx
-; X64-AVX2-NEXT: divl 32(%rsi)
-; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm0 = [8199,8199,8199,8199,8199,8199,8199,8199]
-; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0
-; X64-AVX2-NEXT: imull $8199, %edx, %eax # imm = 0x2007
-; X64-AVX2-NEXT: movl %eax, (%rax)
-; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax)
-; X64-AVX2-NEXT: vzeroupper
-; X64-AVX2-NEXT: retq
- %a0 = load <9 x i16>, <9 x i16>* %p0, align 64
- %a1 = load <9 x i32>, <9 x i32>* %p1, align 64
- %ext0 = zext <9 x i16> %a0 to <9 x i32>
- %rem = urem <9 x i32> %ext0, %a1
- %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
- store <9 x i32> %mul, <9 x i32>* undef, align 64
- ret void
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE42
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-
-; PR31551
-; Pairs of shufflevector:trunc functions with functional equivalence.
-; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
-
-define void @shuffle_v16i8_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
-; SSE2-LABEL: shuffle_v16i8_to_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: shuffle_v16i8_to_v8i8:
-; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: movq %xmm0, (%rsi)
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_to_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: shuffle_v16i8_to_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
- %vec = load <16 x i8>, <16 x i8>* %L
- %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- store <8 x i8> %strided.vec, <8 x i8>* %S
- ret void
-}
-
-define void @trunc_v8i16_to_v8i8(<16 x i8>* %L, <8 x i8>* %S) nounwind {
-; SSE2-LABEL: trunc_v8i16_to_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: trunc_v8i16_to_v8i8:
-; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: movq %xmm0, (%rsi)
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: trunc_v8i16_to_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v8i16_to_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i16_to_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i16_to_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v8i16_to_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovwb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
- %vec = load <16 x i8>, <16 x i8>* %L
- %bc = bitcast <16 x i8> %vec to <8 x i16>
- %strided.vec = trunc <8 x i16> %bc to <8 x i8>
- store <8 x i8> %strided.vec, <8 x i8>* %S
- ret void
-}
-
-define void @shuffle_v8i16_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
-; SSE2-LABEL: shuffle_v8i16_to_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: movq %xmm0, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: shuffle_v8i16_to_v4i16:
-; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT: movq %xmm0, (%rsi)
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: shuffle_v8i16_to_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: shuffle_v8i16_to_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: vmovq %xmm0, (%rsi)
-; AVX512-NEXT: retq
- %vec = load <8 x i16>, <8 x i16>* %L
- %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- store <4 x i16> %strided.vec, <4 x i16>* %S
- ret void
-}
-
-define void @trunc_v4i32_to_v4i16(<8 x i16>* %L, <4 x i16>* %S) nounwind {
-; SSE2-LABEL: trunc_v4i32_to_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: movq %xmm0, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: trunc_v4i32_to_v4i16:
-; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT: movq %xmm0, (%rsi)
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: trunc_v4i32_to_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i32_to_v4i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i32_to_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovdw %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i32_to_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i32_to_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovdw %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
- %vec = load <8 x i16>, <8 x i16>* %L
- %bc = bitcast <8 x i16> %vec to <4 x i32>
- %strided.vec = trunc <4 x i32> %bc to <4 x i16>
- store <4 x i16> %strided.vec, <4 x i16>* %S
- ret void
-}
-
-define void @shuffle_v4i32_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
-; SSE-LABEL: shuffle_v4i32_to_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; SSE-NEXT: movq %xmm0, (%rsi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuffle_v4i32_to_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vmovlps %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: shuffle_v4i32_to_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512-NEXT: retq
- %vec = load <4 x i32>, <4 x i32>* %L
- %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
- store <2 x i32> %strided.vec, <2 x i32>* %S
- ret void
-}
-
-define void @trunc_v2i64_to_v2i32(<4 x i32>* %L, <2 x i32>* %S) nounwind {
-; SSE-LABEL: trunc_v2i64_to_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; SSE-NEXT: movq %xmm0, (%rsi)
-; SSE-NEXT: retq
-;
-; AVX-LABEL: trunc_v2i64_to_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX-NEXT: vmovlps %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v2i64_to_v2i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v2i64_to_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqd %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v2i64_to_v2i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512BW-NEXT: vmovlps %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v2i64_to_v2i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqd %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
- %vec = load <4 x i32>, <4 x i32>* %L
- %bc = bitcast <4 x i32> %vec to <2 x i64>
- %strided.vec = trunc <2 x i64> %bc to <2 x i32>
- store <2 x i32> %strided.vec, <2 x i32>* %S
- ret void
-}
-
-define void @shuffle_v16i8_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
-; SSE2-LABEL: shuffle_v16i8_to_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movd %xmm0, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: shuffle_v16i8_to_v4i8:
-; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: movd %xmm0, (%rsi)
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_to_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: shuffle_v16i8_to_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovd %xmm0, (%rsi)
-; AVX512-NEXT: retq
- %vec = load <16 x i8>, <16 x i8>* %L
- %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
- store <4 x i8> %strided.vec, <4 x i8>* %S
- ret void
-}
-
-define void @trunc_v4i32_to_v4i8(<16 x i8>* %L, <4 x i8>* %S) nounwind {
-; SSE2-LABEL: trunc_v4i32_to_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movd %xmm0, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: trunc_v4i32_to_v4i8:
-; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: movd %xmm0, (%rsi)
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: trunc_v4i32_to_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i32_to_v4i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i32_to_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i32_to_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i32_to_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovdb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
- %vec = load <16 x i8>, <16 x i8>* %L
- %bc = bitcast <16 x i8> %vec to <4 x i32>
- %strided.vec = trunc <4 x i32> %bc to <4 x i8>
- store <4 x i8> %strided.vec, <4 x i8>* %S
- ret void
-}
-
-define void @shuffle_v8i16_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
-; SSE-LABEL: shuffle_v8i16_to_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: movd %xmm0, (%rsi)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: shuffle_v8i16_to_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
- %vec = load <8 x i16>, <8 x i16>* %L
- %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 0, i32 4>
- store <2 x i16> %strided.vec, <2 x i16>* %S
- ret void
-}
-
-define void @trunc_v2i64_to_v2i16(<8 x i16>* %L, <2 x i16>* %S) nounwind {
-; SSE-LABEL: trunc_v2i64_to_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: movd %xmm0, (%rsi)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_v2i64_to_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vmovd %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_v2i64_to_v2i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v2i64_to_v2i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vmovd %xmm0, (%rsi)
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v2i64_to_v2i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v2i64_to_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqw %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v2i64_to_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v2i64_to_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqw %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
- %vec = load <8 x i16>, <8 x i16>* %L
- %bc = bitcast <8 x i16> %vec to <2 x i64>
- %strided.vec = trunc <2 x i64> %bc to <2 x i16>
- store <2 x i16> %strided.vec, <2 x i16>* %S
- ret void
-}
-
-define void @shuffle_v16i8_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
-; SSE2-LABEL: shuffle_v16i8_to_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movw %ax, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: shuffle_v16i8_to_v2i8:
-; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: shuffle_v16i8_to_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: shuffle_v16i8_to_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512-NEXT: retq
- %vec = load <16 x i8>, <16 x i8>* %L
- %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 0, i32 8>
- store <2 x i8> %strided.vec, <2 x i8>* %S
- ret void
-}
-
-define void @trunc_v2i64_to_v2i8(<16 x i8>* %L, <2 x i8>* %S) nounwind {
-; SSE2-LABEL: trunc_v2i64_to_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: movw %ax, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSE42-LABEL: trunc_v2i64_to_v2i8:
-; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa (%rdi), %xmm0
-; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE42-NEXT: pextrw $0, %xmm0, (%rsi)
-; SSE42-NEXT: retq
-;
-; AVX-LABEL: trunc_v2i64_to_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v2i64_to_v2i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v2i64_to_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v2i64_to_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpextrw $0, %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v2i64_to_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vpmovqb %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
- %vec = load <16 x i8>, <16 x i8>* %L
- %bc = bitcast <16 x i8> %vec to <2 x i64>
- %strided.vec = trunc <2 x i64> %bc to <2 x i8>
- store <2 x i8> %strided.vec, <2 x i8>* %S
- ret void
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
-
-; PR31551
-; Pairs of shufflevector:trunc functions with functional equivalence.
-; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
-
-define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: retq
- %vec = load <32 x i8>, <32 x i8>* %L
- %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
- store <16 x i8> %strided.vec, <16 x i8>* %S
- ret void
-}
-
-define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX1-LABEL: trunc_v16i16_to_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v16i16_to_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v16i16_to_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v16i16_to_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpmovwb %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <32 x i8>, <32 x i8>* %L
- %bc = bitcast <32 x i8> %vec to <16 x i16>
- %strided.vec = trunc <16 x i16> %bc to <16 x i8>
- store <16 x i8> %strided.vec, <16 x i8>* %S
- ret void
-}
-
-define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX-LABEL: shuffle_v16i16_to_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v8i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,2,4,6,8,10,12,14]
-; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: retq
- %vec = load <16 x i16>, <16 x i16>* %L
- %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- store <8 x i16> %strided.vec, <8 x i16>* %S
- ret void
-}
-
-define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpmovdw %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <16 x i16>, <16 x i16>* %L
- %bc = bitcast <16 x i16> %vec to <8 x i32>
- %strided.vec = trunc <8 x i32> %bc to <8 x i16>
- store <8 x i16> %strided.vec, <8 x i16>* %S
- ret void
-}
-
-define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
-; AVX-LABEL: shuffle_v8i32_to_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX-NEXT: vmovaps %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: shuffle_v8i32_to_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps (%rdi), %xmm0
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX512-NEXT: vmovaps %xmm0, (%rsi)
-; AVX512-NEXT: retq
- %vec = load <8 x i32>, <8 x i32>* %L
- %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
- store <4 x i32> %strided.vec, <4 x i32>* %S
- ret void
-}
-
-define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rdi), %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX1-NEXT: vmovaps %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],mem[0,2]
-; AVX2-SLOW-NEXT: vmovaps %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm0 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps (%rdi), %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovaps %xmm0, (%rsi)
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovqd %ymm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovqd %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i32:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpmovqd %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <8 x i32>, <8 x i32>* %L
- %bc = bitcast <8 x i32> %vec to <4 x i64>
- %strided.vec = trunc <4 x i64> %bc to <4 x i32>
- store <4 x i32> %strided.vec, <4 x i32>* %S
- ret void
-}
-
-define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2024390091656922112,2024390091656922112]
-; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: retq
- %vec = load <32 x i8>, <32 x i8>* %L
- %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
- store <8 x i8> %strided.vec, <8 x i8>* %S
- ret void
-}
-
-define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX-LABEL: trunc_v8i32_to_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX-NEXT: vmovq %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpmovdb %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <32 x i8>, <32 x i8>* %L
- %bc = bitcast <32 x i8> %vec to <8 x i32>
- %strided.vec = trunc <8 x i32> %bc to <8 x i8>
- store <8 x i8> %strided.vec, <8 x i8>* %S
- ret void
-}
-
-define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind {
-; IR generated from:
-; return (__m128i) {(long long)__builtin_convertvector((__v8si)__A, __v8qi), 0};
-; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v2i64:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated.vec = trunc <8 x i32> %vec to <8 x i8>
- %bc = bitcast <8 x i8> %truncated.vec to i64
- %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
- ret <2 x i64> %result
-}
-
-define <16 x i8> @trunc_v8i32_to_v8i8_with_zext_return_v16i8(<8 x i32> %vec) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_with_zext_return_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated = trunc <8 x i32> %vec to <8 x i8>
- %truncated.ext = zext <8 x i8> %truncated to <8 x i16>
- %bc = bitcast <8 x i16> %truncated.ext to <16 x i8>
- %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
- ret <16 x i8> %result
-}
-
-define <16 x i8> @trunc_v8i32_to_v8i8_via_v8i16_return_v16i8(<8 x i32> %vec) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_via_v8i16_return_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated = trunc <8 x i32> %vec to <8 x i16>
- %bc = bitcast <8 x i16> %truncated to <16 x i8>
- %result = shufflevector <16 x i8> %bc, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 17, i32 20, i32 24, i32 22, i32 31, i32 28, i32 28, i32 29>
- ret <16 x i8> %result
-}
-
-define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind {
-; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v8i32_to_v8i8_return_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated = trunc <8 x i32> %vec to <8 x i8>
- %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %result
-}
-
-define <2 x i64> @trunc_v4i64_to_v4i16_return_v2i64(<4 x i64> %vec) nounwind {
-; IR generated from:
-; return (__m128i) {(long long)__builtin_convertvector((__v4di)x, __v4hi), 0};
-; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v2i64:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated = trunc <4 x i64> %vec to <4 x i16>
- %bc = bitcast <4 x i16> %truncated to i64
- %result = insertelement <2 x i64> zeroinitializer, i64 %bc, i32 0
- ret <2 x i64> %result
-}
-
-define <8 x i16> @trunc_v4i64_to_v4i16_with_zext_return_v8i16(<4 x i64> %vec) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_with_zext_return_v8i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated = trunc <4 x i64> %vec to <4 x i16>
- %truncated.ext = zext <4 x i16> %truncated to <4 x i32>
- %bc = bitcast <4 x i32> %truncated.ext to <8 x i16>
- %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
- ret <8 x i16> %result
-}
-
-define <8 x i16> @trunc_v4i64_to_v4i16_via_v4i32_return_v8i16(<4 x i64> %vec) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_via_v4i32_return_v8i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated = trunc <4 x i64> %vec to <4 x i32>
- %bc = bitcast <4 x i32> %truncated to <8 x i16>
- %result = shufflevector <8 x i16> %bc, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 8, i32 undef, i32 13>
- ret <8 x i16> %result
-}
-
-define <8 x i16> @trunc_v4i64_to_v4i16_return_v8i16(<4 x i64> %vec) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16_return_v8i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated = trunc <4 x i64> %vec to <4 x i16>
- %result = shufflevector <4 x i16> %truncated, <4 x i16> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %result
-}
-
-define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8_return_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512VBMIVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VBMIVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %truncated = trunc <4 x i64> %vec to <4 x i8>
- %result = shufflevector <4 x i8> %truncated, <4 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 5, i32 5, i32 undef, i32 7>
- ret <16 x i8> %result
-}
-
-define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
-; AVX1-LABEL: shuffle_v16i16_to_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
-; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,4,8,12,4,5,12,13]
-; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: retq
- %vec = load <16 x i16>, <16 x i16>* %L
- %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
- store <4 x i16> %strided.vec, <4 x i16>* %S
- ret void
-}
-
-define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
-; AVX1-LABEL: trunc_v4i64_to_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vmovq %xmm0, (%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_v4i64_to_v4i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX2-SLOW-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_v4i64_to_v4i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vmovq %xmm0, (%rsi)
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovqw %ymm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovqw %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpmovqw %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <16 x i16>, <16 x i16>* %L
- %bc = bitcast <16 x i16> %vec to <4 x i64>
- %strided.vec = trunc <4 x i64> %bc to <4 x i16>
- store <4 x i16> %strided.vec, <4 x i16>* %S
- ret void
-}
-
-define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMIVL-NEXT: vpbroadcastd {{.*#+}} xmm1 = [403703808,403703808,403703808,403703808]
-; AVX512VBMIVL-NEXT: vpermi2b 16(%rdi), %xmm0, %xmm1
-; AVX512VBMIVL-NEXT: vmovd %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: retq
- %vec = load <32 x i8>, <32 x i8>* %L
- %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
- store <4 x i8> %strided.vec, <4 x i8>* %S
- ret void
-}
-
-define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
-; AVX-LABEL: trunc_v4i64_to_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: vmovd %xmm0, (%rsi)
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: trunc_v4i64_to_v4i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512F-NEXT: vmovd %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpmovqb %ymm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpmovqb %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v4i64_to_v4i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpmovqb %ymm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <32 x i8>, <32 x i8>* %L
- %bc = bitcast <32 x i8> %vec to <4 x i64>
- %strided.vec = trunc <4 x i64> %bc to <4 x i8>
- store <4 x i8> %strided.vec, <4 x i8>* %S
- ret void
-}
-
-; In this case not all elements are collected from the same source vector, so
-; the resulting BUILD_VECTOR should not be combined to a truncate.
-define <16 x i8> @negative(<32 x i8> %v, <32 x i8> %w) nounwind {
-; AVX1-LABEL: negative:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[u,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u],zero,zero,zero,zero,zero,zero,zero,xmm0[0,2,4,6,8,10,12,14]
-; AVX1-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: negative:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: negative:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: negative:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: negative:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: negative:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,u,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT: movl $65537, %eax # imm = 0x10001
-; AVX512BWVL-NEXT: kmovd %eax, %k1
-; AVX512BWVL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
-; AVX512BWVL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,2,3]
-; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: negative:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,48,18,20,22,24,26,28,30,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
-; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %strided.vec = shufflevector <32 x i8> %v, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
- %w0 = extractelement <32 x i8> %w, i32 0
- %merged = insertelement <16 x i8> %strided.vec, i8 %w0, i32 0
- ret <16 x i8> %merged
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMIVL
-
-; PR31551
-; Pairs of shufflevector:trunc functions with functional equivalence.
-; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
-
-define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512F-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
-; AVX512VL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,5,7]
-; AVX512BWVL-NEXT: vpermi2q %ymm1, %ymm0, %ymm2
-; AVX512BWVL-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
-; AVX512VBMI-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
-; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT: vmovdqa %ymm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v32i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62]
-; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <64 x i8>, <64 x i8>* %L
- %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
- store <32 x i8> %strided.vec, <32 x i8>* %S
- ret void
-}
-
-define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
-; AVX512F-LABEL: trunc_v32i16_to_v32i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, 16(%rsi)
-; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm1, 16(%rsi)
-; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512VBMI-NEXT: vpmovwb %zmm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_v32i16_to_v32i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512VBMIVL-NEXT: vpmovwb %zmm0, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <64 x i8>, <64 x i8>* %L
- %bc = bitcast <64 x i8> %vec to <32 x i16>
- %strided.vec = trunc <32 x i16> %bc to <32 x i8>
- store <32 x i8> %strided.vec, <32 x i8>* %S
- ret void
-}
-
-define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT: vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
-; AVX512F-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14]
-; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2
-; AVX512VL-NEXT: vmovdqa %ymm2, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512BW-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512BW-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,4,5,6,7,8,9,12,13,12,13,14,15,16,17,20,21,20,21,22,23,24,25,28,29,28,29,30,31]
-; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX512VBMI-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
-; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v16i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <32 x i16>, <32 x i16>* %L
- %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
- store <16 x i16> %strided.vec, <16 x i16>* %S
- ret void
-}
-
-define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
-; AVX512-LABEL: trunc_v16i32_to_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovdw %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %vec = load <32 x i16>, <32 x i16>* %L
- %bc = bitcast <32 x i16> %vec to <16 x i32>
- %strided.vec = trunc <16 x i32> %bc to <16 x i16>
- store <16 x i16> %strided.vec, <16 x i16>* %S
- ret void
-}
-
-define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; AVX512F-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512F-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512F-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512VL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512VL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
-; AVX512BW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512BW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512BW-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovaps (%rdi), %ymm0
-; AVX512VBMI-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],mem[0,2],ymm0[4,6],mem[4,6]
-; AVX512VBMI-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VBMI-NEXT: vmovaps %ymm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v16i32_to_v8i32:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14]
-; AVX512VBMIVL-NEXT: vpermi2d 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %ymm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <16 x i32>, <16 x i32>* %L
- %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
- store <8 x i32> %strided.vec, <8 x i32>* %S
- ret void
-}
-
-define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
-; AVX512-LABEL: trunc_v8i64_to_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %vec = load <16 x i32>, <16 x i32>* %L
- %bc = bitcast <16 x i32> %vec to <8 x i64>
- %strided.vec = trunc <8 x i64> %bc to <8 x i32>
- store <8 x i32> %strided.vec, <8 x i32>* %S
- ret void
-}
-
-define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <64 x i8>, <64 x i8>* %L
- %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
- store <16 x i8> %strided.vec, <16 x i8>* %S
- ret void
-}
-
-define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512-LABEL: trunc_v16i32_to_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %vec = load <64 x i8>, <64 x i8>* %L
- %bc = bitcast <64 x i8> %vec to <16 x i32>
- %strided.vec = trunc <16 x i32> %bc to <16 x i8>
- store <16 x i8> %strided.vec, <16 x i8>* %S
- ret void
-}
-
-define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[0,2,2,3]
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
-; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <32 x i16>, <32 x i16>* %L
- %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
- store <8 x i16> %strided.vec, <8 x i16>* %S
- ret void
-}
-
-define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512-LABEL: trunc_v8i64_to_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %vec = load <32 x i16>, <32 x i16>* %L
- %bc = bitcast <32 x i16> %vec to <8 x i64>
- %strided.vec = trunc <8 x i64> %bc to <8 x i16>
- store <8 x i16> %strided.vec, <8 x i16>* %S
- ret void
-}
-
-define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
-; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %vec = load <64 x i8>, <64 x i8>* %L
- %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
- store <8 x i8> %strided.vec, <8 x i8>* %S
- ret void
-}
-
-define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512-LABEL: trunc_v8i64_to_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
-; AVX512-NEXT: vpmovqb %zmm0, (%rsi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %vec = load <64 x i8>, <64 x i8>* %L
- %bc = bitcast <64 x i8> %vec to <8 x i64>
- %strided.vec = trunc <8 x i64> %bc to <8 x i8>
- store <8 x i8> %strided.vec, <8 x i8>* %S
- ret void
-}
-
-define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
-; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
-; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
-; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
- ret <16 x i8> %res
-}
-
-define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
-; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpshufb %xmm2, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
-; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
-; AVX512VBMIVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMIVL-NEXT: vpermt2b %ymm2, %ymm1, %ymm0
-; AVX512VBMIVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
- %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
- ret <16 x i8> %res
-}
-
-define <4 x double> @PR34175(<32 x i16>* %p) {
-; AVX512F-LABEL: PR34175:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512F-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: PR34175:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,4,2,3]
-; AVX512VL-NEXT: vpermi2d %xmm1, %xmm0, %xmm2
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX512VL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: PR34175:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512BW-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512BW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512BW-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: PR34175:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
-; AVX512BWVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512BWVL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: PR34175:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqu 32(%rdi), %xmm1
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
-; AVX512VBMI-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX512VBMI-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512VBMI-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: PR34175:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [6755468161056768,6755468161056768,6755468161056768,6755468161056768]
-; AVX512VBMIVL-NEXT: vpermi2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512VBMIVL-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX512VBMIVL-NEXT: retq
- %v = load <32 x i16>, <32 x i16>* %p, align 2
- %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
- %tofp = uitofp <4 x i16> %shuf to <4 x double>
- ret <4 x double> %tofp
-}
-
-define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
-; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %truncated = trunc <8 x i64> %vec to <8 x i8>
- %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %result
-}
-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
define <8 x float> @cvt_v8i8_v8f32(<8 x i8> %src) {
; CHECK-LABEL: cvt_v8i8_v8f32:
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v8i8_v8f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm1
-; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-WIDE-NEXT: retl
%res = sitofp <8 x i8> %src to <8 x float>
ret <8 x float> %res
}
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v8i16_v8f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm1
-; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-WIDE-NEXT: retl
%res = sitofp <8 x i16> %src to <8 x float>
ret <8 x float> %res
}
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v4i8_v4f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = sitofp <4 x i8> %src to <4 x float>
ret <4 x float> %res
}
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v4i16_v4f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = sitofp <4 x i16> %src to <4 x float>
ret <4 x float> %res
}
; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v8u8_v8f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-WIDE-NEXT: retl
%res = uitofp <8 x i8> %src to <8 x float>
ret <8 x float> %res
}
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v8u16_v8f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; CHECK-WIDE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-WIDE-NEXT: vcvtdq2ps %ymm0, %ymm0
-; CHECK-WIDE-NEXT: retl
%res = uitofp <8 x i16> %src to <8 x float>
ret <8 x float> %res
}
; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v4u8_v4f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = uitofp <4 x i8> %src to <4 x float>
ret <4 x float> %res
}
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v4u16_v4f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = uitofp <4 x i16> %src to <4 x float>
ret <4 x float> %res
}
; CHECK-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v8f32_v8i8:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vzeroupper
-; CHECK-WIDE-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i8>
ret <8 x i8> %res
}
; CHECK-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v8f32_v8i16:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vzeroupper
-; CHECK-WIDE-NEXT: retl
%res = fptosi <8 x float> %src to <8 x i16>
ret <8 x i16> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v4f32_v4i8:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-WIDE-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i8>
ret <4 x i8> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v4f32_v4i16:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = fptosi <4 x float> %src to <4 x i16>
ret <4 x i16> %res
}
; CHECK-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v8f32_v8u8:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vzeroupper
-; CHECK-WIDE-NEXT: retl
%res = fptoui <8 x float> %src to <8 x i8>
ret <8 x i8> %res
}
; CHECK-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v8f32_v8u16:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %ymm0, %ymm0
-; CHECK-WIDE-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-WIDE-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vzeroupper
-; CHECK-WIDE-NEXT: retl
%res = fptoui <8 x float> %src to <8 x i16>
ret <8 x i16> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v4f32_v4u8:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-WIDE-NEXT: retl
%res = fptoui <4 x float> %src to <4 x i8>
ret <4 x i8> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v4f32_v4u16:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = fptoui <4 x float> %src to <4 x i16>
ret <4 x i16> %res
}
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx | FileCheck %s
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+avx -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=CHECK-WIDE
define <2 x float> @cvt_v2i8_v2f32(<2 x i8> %src) {
; CHECK-LABEL: cvt_v2i8_v2f32:
; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2i8_v2f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovsxbd %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = sitofp <2 x i8> %src to <2 x float>
ret <2 x float> %res
}
; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2i16_v2f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovsxwd %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = sitofp <2 x i16> %src to <2 x float>
ret <2 x float> %res
}
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2i32_v2f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = sitofp <2 x i32> %src to <2 x float>
ret <2 x float> %res
}
; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2u8_v2f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = uitofp <2 x i8> %src to <2 x float>
ret <2 x float> %res
}
; CHECK-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2u16_v2f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-WIDE-NEXT: vcvtdq2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = uitofp <2 x i16> %src to <2 x float>
ret <2 x float> %res
}
; CHECK-NEXT: vsubpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2u32_v2f32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; CHECK-WIDE-NEXT: vmovdqa {{.*#+}} xmm1 = [4.503599627370496E+15,4.503599627370496E+15]
-; CHECK-WIDE-NEXT: vpor %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vsubpd %xmm1, %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vcvtpd2ps %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = uitofp <2 x i32> %src to <2 x float>
ret <2 x float> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2i8:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-WIDE-NEXT: retl
%res = fptosi <2 x float> %src to <2 x i8>
ret <2 x i8> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2i16:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-WIDE-NEXT: retl
%res = fptosi <2 x float> %src to <2 x i16>
ret <2 x i16> %res
}
; CHECK: ## %bb.0:
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2i32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = fptosi <2 x float> %src to <2 x i32>
ret <2 x i32> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2u8:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-WIDE-NEXT: retl
%res = fptoui <2 x float> %src to <2 x i8>
ret <2 x i8> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2u16:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; CHECK-WIDE-NEXT: retl
%res = fptoui <2 x float> %src to <2 x i16>
ret <2 x i16> %res
}
; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0
; CHECK-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: cvt_v2f32_v2u32:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; CHECK-WIDE-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
-; CHECK-WIDE-NEXT: vsubps %xmm1, %xmm0, %xmm1
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm1, %xmm1
-; CHECK-WIDE-NEXT: vxorps LCPI11_1, %xmm1, %xmm1
-; CHECK-WIDE-NEXT: vcvttps2dq %xmm0, %xmm0
-; CHECK-WIDE-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; CHECK-WIDE-NEXT: retl
%res = fptoui <2 x float> %src to <2 x i32>
ret <2 x i32> %res
}
; CHECK-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: retl
-;
-; CHECK-WIDE-LABEL: PR40146:
-; CHECK-WIDE: ## %bb.0:
-; CHECK-WIDE-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; CHECK-WIDE-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-WIDE-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-WIDE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-WIDE-NEXT: retl
%perm = shufflevector <4 x i64> %x, <4 x i64> undef, <4 x i32> <i32 0, i32 undef, i32 1, i32 undef>
%t1 = bitcast <4 x i64> %perm to <32 x i8>
%t2 = shufflevector <32 x i8> %t1, <32 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <32 x i32> <i32 0, i32 32, i32 1, i32 32, i32 2, i32 32, i32 3, i32 32, i32 4, i32 32, i32 5, i32 32, i32 6, i32 32, i32 7, i32 32, i32 16, i32 48, i32 17, i32 48, i32 18, i32 48, i32 19, i32 48, i32 20, i32 48, i32 21, i32 48, i32 22, i32 48, i32 23, i32 48>
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ
-;
-; 32-bit tests to make sure we're not doing anything stupid.
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2
-
-;
-; Double to Signed Integer
-;
-
-define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptosi_2f64_to_2i64:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttsd2si %xmm0, %rax
-; VEX-NEXT: vmovq %rax, %xmm1
-; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT: vcvttsd2si %xmm0, %rax
-; VEX-NEXT: vmovq %rax, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_2f64_to_2i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f64_to_2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f64_to_2i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2qq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptosi <2 x double> %a to <2 x i64>
- ret <2 x i64> %cvt
-}
-
-define <4 x i32> @fptosi_2f64_to_4i32(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f64_to_4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = fptosi <2 x double> %a to <2 x i32>
- %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %ext
-}
-
-define <2 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = fptosi <2 x double> %a to <2 x i32>
- ret <2 x i32> %cvt
-}
-
-define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
-; SSE-LABEL: fptosi_4f64_to_2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_4f64_to_2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
- %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- %cvt = fptosi <4 x double> %ext to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
-; SSE-LABEL: fptosi_4f64_to_4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movq %rax, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptosi_4f64_to_4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vcvttsd2si %xmm1, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX1-NEXT: vcvttsd2si %xmm1, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vcvttsd2si %xmm0, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vcvttsd2si %xmm0, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptosi_4f64_to_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vcvttsd2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-NEXT: vcvttsd2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vcvttsd2si %xmm0, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvttsd2si %xmm0, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_4f64_to_4i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vcvttsd2si %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvttsd2si %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vcvttsd2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_4f64_to_4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvttsd2si %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvttsd2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_4f64_to_4i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2qq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_4f64_to_4i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2qq %ymm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptosi <4 x double> %a to <4 x i64>
- ret <4 x i64> %cvt
-}
-
-define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
-; SSE-LABEL: fptosi_4f64_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_4f64_to_4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
- %cvt = fptosi <4 x double> %a to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-;
-; Double to Unsigned Integer
-;
-
-define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: subsd %xmm2, %xmm1
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movapd %xmm0, %xmm3
-; SSE-NEXT: subsd %xmm2, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptoui_2f64_to_2i64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
-; VEX-NEXT: vcvttsd2si %xmm2, %rax
-; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttsd2si %xmm0, %rdx
-; VEX-NEXT: vucomisd %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rdx
-; VEX-NEXT: vmovq %rdx, %xmm2
-; VEX-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm3
-; VEX-NEXT: vcvttsd2si %xmm3, %rax
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttsd2si %xmm0, %rcx
-; VEX-NEXT: vucomisd %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rcx
-; VEX-NEXT: vmovq %rcx, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f64_to_2i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f64_to_2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f64_to_2i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2uqq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <2 x double> %a to <2 x i64>
- ret <2 x i64> %cvt
-}
-
-define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd %ecx, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_2f64_to_4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3
-; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_2f64_to_4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpackssdw %xmm0, %xmm2, %xmm2
-; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f64_to_4i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f64_to_4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f64_to_4i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <2 x double> %a to <2 x i32>
- %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %ext
-}
-
-define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_2f64_to_2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm3
-; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_2f64_to_2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f64_to_2i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f64_to_2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2udq %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f64_to_2i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <2 x double> %a to <2 x i32>
- %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- ret <4 x i32> %ext
-}
-
-define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_4f64_to_2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd %xmm0, %xmm0
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1
-; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_4f64_to_2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovapd %xmm0, %xmm0
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_4f64_to_2i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps %xmm0, %xmm0
-; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_4f64_to_2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps %xmm0, %xmm0
-; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_4f64_to_2i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps %xmm0, %xmm0
-; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vmovaps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %ext = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = fptoui <4 x double> %ext to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
-; SSE-NEXT: subsd %xmm3, %xmm0
-; SSE-NEXT: cvttsd2si %xmm0, %rcx
-; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm2, %rdx
-; SSE-NEXT: ucomisd %xmm3, %xmm2
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
-; SSE-NEXT: movapd %xmm2, %xmm4
-; SSE-NEXT: subsd %xmm3, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm2, %rdx
-; SSE-NEXT: ucomisd %xmm3, %xmm2
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm2
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: subsd %xmm3, %xmm2
-; SSE-NEXT: cvttsd2si %xmm2, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm1, %rdx
-; SSE-NEXT: ucomisd %xmm3, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: movapd %xmm1, %xmm4
-; SSE-NEXT: subsd %xmm3, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: ucomisd %xmm3, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_4f64_to_4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vcvttsd2si %xmm3, %rax
-; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttsd2si %xmm2, %rdx
-; AVX1-NEXT: vucomisd %xmm1, %xmm2
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm3
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm4
-; AVX1-NEXT: vcvttsd2si %xmm4, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttsd2si %xmm2, %rdx
-; AVX1-NEXT: vucomisd %xmm1, %xmm2
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vcvttsd2si %xmm3, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttsd2si %xmm0, %rdx
-; AVX1-NEXT: vucomisd %xmm1, %xmm0
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm3
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX1-NEXT: vsubsd %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vcvttsd2si %xmm4, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttsd2si %xmm0, %rcx
-; AVX1-NEXT: vucomisd %xmm1, %xmm0
-; AVX1-NEXT: cmovaeq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_4f64_to_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vcvttsd2si %xmm3, %rax
-; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttsd2si %xmm2, %rdx
-; AVX2-NEXT: vucomisd %xmm1, %xmm2
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm3
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
-; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm4
-; AVX2-NEXT: vcvttsd2si %xmm4, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttsd2si %xmm2, %rdx
-; AVX2-NEXT: vucomisd %xmm1, %xmm2
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vcvttsd2si %xmm3, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttsd2si %xmm0, %rdx
-; AVX2-NEXT: vucomisd %xmm1, %xmm0
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm3
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm4
-; AVX2-NEXT: vcvttsd2si %xmm4, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttsd2si %xmm0, %rcx
-; AVX2-NEXT: vucomisd %xmm1, %xmm0
-; AVX2-NEXT: cmovaeq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_4f64_to_4i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512F-NEXT: vcvttsd2usi %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512F-NEXT: vcvttsd2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_4f64_to_4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX512VL-NEXT: vcvttsd2usi %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX512VL-NEXT: vcvttsd2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_4f64_to_4i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2uqq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f64_to_4i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2uqq %ymm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <4 x double> %a to <4 x i64>
- ret <4 x i64> %cvt
-}
-
-define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_4f64_to_4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX1-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vsubpd %ymm1, %ymm0, %ymm1
-; AVX1-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX1-NEXT: vxorpd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_4f64_to_4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2.147483648E+9,2.147483648E+9,2.147483648E+9,2.147483648E+9]
-; AVX2-NEXT: vcmpltpd %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX2-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vsubpd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vcvttpd2dq %ymm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttpd2dq %ymm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_4f64_to_4i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_4f64_to_4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2udq %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_4f64_to_4i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvttpd2udq %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2udq %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <4 x double> %a to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-;
-; Float to Signed Integer
-;
-
-define <2 x i32> @fptosi_2f32_to_2i32(<2 x float> %a) {
-; SSE-LABEL: fptosi_2f32_to_2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f32_to_2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = fptosi <2 x float> %a to <2 x i32>
- ret <2 x i32> %cvt
-}
-
-define <4 x i32> @fptosi_4f32_to_4i32(<4 x float> %a) {
-; SSE-LABEL: fptosi_4f32_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_4f32_to_4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = fptosi <4 x float> %a to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-define <2 x i64> @fptosi_2f32_to_2i64(<4 x float> %a) {
-; SSE-LABEL: fptosi_2f32_to_2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptosi_2f32_to_2i64:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttss2si %xmm0, %rax
-; VEX-NEXT: vmovq %rax, %xmm1
-; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT: vcvttss2si %xmm0, %rax
-; VEX-NEXT: vmovq %rax, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_2f32_to_2i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f32_to_2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f32_to_2i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
- %cvt = fptosi <2 x float> %shuf to <2 x i64>
- ret <2 x i64> %cvt
-}
-
-define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
-; SSE-LABEL: fptosi_4f32_to_2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptosi_4f32_to_2i64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; VEX-NEXT: vcvttss2si %xmm1, %rax
-; VEX-NEXT: vcvttss2si %xmm0, %rcx
-; VEX-NEXT: vmovq %rcx, %xmm0
-; VEX-NEXT: vmovq %rax, %xmm1
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_4f32_to_2i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2si %xmm1, %rax
-; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
-; AVX512F-NEXT: vmovq %rcx, %xmm0
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_4f32_to_2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx
-; AVX512VL-NEXT: vmovq %rcx, %xmm0
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_4f32_to_2i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
-; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %cvt = fptosi <4 x float> %a to <4 x i64>
- %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x i64> %shuf
-}
-
-define <8 x i32> @fptosi_8f32_to_8i32(<8 x float> %a) {
-; SSE-LABEL: fptosi_8f32_to_8i32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_8f32_to_8i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX-NEXT: retq
- %cvt = fptosi <8 x float> %a to <8 x i32>
- ret <8 x i32> %cvt
-}
-
-define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
-; SSE-LABEL: fptosi_4f32_to_4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movq %rax, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptosi_4f32_to_4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvttss2si %xmm1, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT: vcvttss2si %xmm2, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vcvttss2si %xmm0, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvttss2si %xmm0, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptosi_4f32_to_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX2-NEXT: vcvttss2si %xmm2, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vcvttss2si %xmm0, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvttss2si %xmm0, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_4f32_to_4i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvttss2si %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512F-NEXT: vcvttss2si %xmm2, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_4f32_to_4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512VL-NEXT: vcvttss2si %xmm2, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_4f32_to_4i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_4f32_to_4i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2qq %xmm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = fptosi <4 x float> %shuf to <4 x i64>
- ret <4 x i64> %cvt
-}
-
-define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
-; SSE-LABEL: fptosi_8f32_to_4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movq %rax, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptosi_8f32_to_4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX1-NEXT: vcvttss2si %xmm1, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX1-NEXT: vcvttss2si %xmm2, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT: vcvttss2si %xmm0, %rax
-; AVX1-NEXT: vmovq %rax, %xmm2
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vcvttss2si %xmm0, %rax
-; AVX1-NEXT: vmovq %rax, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptosi_8f32_to_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX2-NEXT: vcvttss2si %xmm2, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vcvttss2si %xmm0, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvttss2si %xmm0, %rax
-; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_8f32_to_4i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2si %xmm1, %rax
-; AVX512F-NEXT: vcvttss2si %xmm0, %rcx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvttss2si %xmm1, %rdx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvttss2si %xmm0, %rsi
-; AVX512F-NEXT: vmovq %rsi, %xmm0
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovq %rcx, %xmm1
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_8f32_to_4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2si %xmm1, %rax
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rcx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvttss2si %xmm1, %rdx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm0
-; AVX512VL-NEXT: vmovq %rdx, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vmovq %rcx, %xmm1
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_8f32_to_4i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_8f32_to_4i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptosi <8 x float> %a to <8 x i64>
- %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i64> %shuf
-}
-
-;
-; Float to Unsigned Integer
-;
-
-define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
-; SSE-LABEL: fptoui_2f32_to_2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: cmpltps %xmm2, %xmm1
-; SSE-NEXT: cvttps2dq %xmm0, %xmm3
-; SSE-NEXT: subps %xmm2, %xmm0
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: andnps %xmm0, %xmm1
-; SSE-NEXT: orps %xmm3, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_2f32_to_2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_2f32_to_2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f32_to_2i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f32_to_2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f32_to_2i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <2 x float> %a to <2 x i32>
- ret <2 x i32> %cvt
-}
-
-define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
-; SSE-LABEL: fptoui_4f32_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm2 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: cmpltps %xmm2, %xmm1
-; SSE-NEXT: cvttps2dq %xmm0, %xmm3
-; SSE-NEXT: subps %xmm2, %xmm0
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
-; SSE-NEXT: andps %xmm1, %xmm3
-; SSE-NEXT: andnps %xmm0, %xmm1
-; SSE-NEXT: orps %xmm3, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_4f32_to_4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vsubps %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_4f32_to_4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT: vcmpltps %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vsubps %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorps %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_4f32_to_4i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_4f32_to_4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_4f32_to_4i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2udq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <4 x float> %a to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
-; SSE-LABEL: fptoui_2f32_to_2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: subss %xmm2, %xmm1
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttss2si %xmm0, %rdx
-; SSE-NEXT: ucomiss %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: subss %xmm2, %xmm3
-; SSE-NEXT: cvttss2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttss2si %xmm0, %rcx
-; SSE-NEXT: ucomiss %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptoui_2f32_to_2i64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
-; VEX-NEXT: vcvttss2si %xmm2, %rax
-; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttss2si %xmm0, %rdx
-; VEX-NEXT: vucomiss %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rdx
-; VEX-NEXT: vmovq %rdx, %xmm2
-; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3
-; VEX-NEXT: vcvttss2si %xmm3, %rax
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttss2si %xmm0, %rcx
-; VEX-NEXT: vucomiss %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rcx
-; VEX-NEXT: vmovq %rcx, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f32_to_2i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f32_to_2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f32_to_2i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %shuf = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1>
- %cvt = fptoui <2 x float> %shuf to <2 x i64>
- ret <2 x i64> %cvt
-}
-
-define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
-; SSE-LABEL: fptoui_4f32_to_2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: subss %xmm2, %xmm1
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttss2si %xmm0, %rdx
-; SSE-NEXT: ucomiss %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movq %rdx, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: subss %xmm2, %xmm3
-; SSE-NEXT: cvttss2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttss2si %xmm0, %rcx
-; SSE-NEXT: ucomiss %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptoui_4f32_to_2i64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; VEX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; VEX-NEXT: vsubss %xmm2, %xmm1, %xmm3
-; VEX-NEXT: vcvttss2si %xmm3, %rax
-; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttss2si %xmm1, %rdx
-; VEX-NEXT: vucomiss %xmm2, %xmm1
-; VEX-NEXT: cmovaeq %rax, %rdx
-; VEX-NEXT: vsubss %xmm2, %xmm0, %xmm1
-; VEX-NEXT: vcvttss2si %xmm1, %rax
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttss2si %xmm0, %rcx
-; VEX-NEXT: vucomiss %xmm2, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rcx
-; VEX-NEXT: vmovq %rcx, %xmm0
-; VEX-NEXT: vmovq %rdx, %xmm1
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_4f32_to_2i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
-; AVX512F-NEXT: vmovq %rcx, %xmm0
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_4f32_to_2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx
-; AVX512VL-NEXT: vmovq %rcx, %xmm0
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_4f32_to_2i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0
-; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <4 x float> %a to <4 x i64>
- %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x i64> %shuf
-}
-
-define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
-; SSE-LABEL: fptoui_8f32_to_8i32:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm4 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: cmpltps %xmm4, %xmm2
-; SSE-NEXT: cvttps2dq %xmm0, %xmm3
-; SSE-NEXT: subps %xmm4, %xmm0
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: movaps {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
-; SSE-NEXT: xorps %xmm5, %xmm0
-; SSE-NEXT: andps %xmm2, %xmm3
-; SSE-NEXT: andnps %xmm0, %xmm2
-; SSE-NEXT: orps %xmm3, %xmm2
-; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: cmpltps %xmm4, %xmm3
-; SSE-NEXT: cvttps2dq %xmm1, %xmm0
-; SSE-NEXT: subps %xmm4, %xmm1
-; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: xorps %xmm5, %xmm1
-; SSE-NEXT: andps %xmm3, %xmm0
-; SSE-NEXT: andnps %xmm1, %xmm3
-; SSE-NEXT: orps %xmm0, %xmm3
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: movaps %xmm3, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_8f32_to_8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX1-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX1-NEXT: vsubps %ymm1, %ymm0, %ymm1
-; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX1-NEXT: vxorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_8f32_to_8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
-; AVX2-NEXT: vcmpltps %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vsubps %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_8f32_to_8i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_8f32_to_8i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttps2udq %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_8f32_to_8i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvttps2udq %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_8f32_to_8i32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2udq %ymm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <8 x float> %a to <8 x i32>
- ret <8 x i32> %cvt
-}
-
-define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
-; SSE-LABEL: fptoui_4f32_to_4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: subss %xmm1, %xmm2
-; SSE-NEXT: cvttss2si %xmm2, %rcx
-; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttss2si %xmm0, %rdx
-; SSE-NEXT: ucomiss %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: subss %xmm1, %xmm4
-; SSE-NEXT: cvttss2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttss2si %xmm3, %rdx
-; SSE-NEXT: ucomiss %xmm1, %xmm3
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm3
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: subss %xmm1, %xmm4
-; SSE-NEXT: cvttss2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttss2si %xmm3, %rdx
-; SSE-NEXT: ucomiss %xmm1, %xmm3
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: subss %xmm1, %xmm4
-; SSE-NEXT: cvttss2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: ucomiss %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_4f32_to_4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vcvttss2si %xmm3, %rax
-; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttss2si %xmm2, %rdx
-; AVX1-NEXT: vucomiss %xmm1, %xmm2
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm2
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vcvttss2si %xmm4, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttss2si %xmm3, %rdx
-; AVX1-NEXT: vucomiss %xmm1, %xmm3
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vcvttss2si %xmm3, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttss2si %xmm0, %rdx
-; AVX1-NEXT: vucomiss %xmm1, %xmm0
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm3
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vcvttss2si %xmm4, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttss2si %xmm0, %rcx
-; AVX1-NEXT: vucomiss %xmm1, %xmm0
-; AVX1-NEXT: cmovaeq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_4f32_to_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vcvttss2si %xmm3, %rax
-; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttss2si %xmm2, %rdx
-; AVX2-NEXT: vucomiss %xmm1, %xmm2
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
-; AVX2-NEXT: vcvttss2si %xmm4, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttss2si %xmm3, %rdx
-; AVX2-NEXT: vucomiss %xmm1, %xmm3
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vcvttss2si %xmm3, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttss2si %xmm0, %rdx
-; AVX2-NEXT: vucomiss %xmm1, %xmm0
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm3
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4
-; AVX2-NEXT: vcvttss2si %xmm4, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttss2si %xmm0, %rcx
-; AVX2-NEXT: vucomiss %xmm1, %xmm0
-; AVX2-NEXT: cmovaeq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_4f32_to_4i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512F-NEXT: vcvttss2usi %xmm2, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_4f32_to_4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX512VL-NEXT: vcvttss2usi %xmm2, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_4f32_to_4i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_4f32_to_4i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2uqq %xmm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %shuf = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = fptoui <4 x float> %shuf to <4 x i64>
- ret <4 x i64> %cvt
-}
-
-define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
-; SSE-LABEL: fptoui_8f32_to_4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: subss %xmm1, %xmm2
-; SSE-NEXT: cvttss2si %xmm2, %rcx
-; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttss2si %xmm0, %rdx
-; SSE-NEXT: ucomiss %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm2
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: subss %xmm1, %xmm4
-; SSE-NEXT: cvttss2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttss2si %xmm3, %rdx
-; SSE-NEXT: ucomiss %xmm1, %xmm3
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm3
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
-; SSE-NEXT: movaps %xmm3, %xmm4
-; SSE-NEXT: subss %xmm1, %xmm4
-; SSE-NEXT: cvttss2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttss2si %xmm3, %rdx
-; SSE-NEXT: ucomiss %xmm1, %xmm3
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movq %rdx, %xmm3
-; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; SSE-NEXT: movaps %xmm0, %xmm4
-; SSE-NEXT: subss %xmm1, %xmm4
-; SSE-NEXT: cvttss2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: ucomiss %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_8f32_to_4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vcvttss2si %xmm3, %rax
-; AVX1-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttss2si %xmm2, %rdx
-; AVX1-NEXT: vucomiss %xmm1, %xmm2
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm2
-; AVX1-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX1-NEXT: vsubss %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vcvttss2si %xmm4, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttss2si %xmm3, %rdx
-; AVX1-NEXT: vucomiss %xmm1, %xmm3
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm3
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vcvttss2si %xmm3, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttss2si %xmm0, %rdx
-; AVX1-NEXT: vucomiss %xmm1, %xmm0
-; AVX1-NEXT: cmovaeq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm3
-; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm4
-; AVX1-NEXT: vcvttss2si %xmm4, %rax
-; AVX1-NEXT: xorq %rcx, %rax
-; AVX1-NEXT: vcvttss2si %xmm0, %rcx
-; AVX1-NEXT: vucomiss %xmm1, %xmm0
-; AVX1-NEXT: cmovaeq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_8f32_to_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
-; AVX2-NEXT: vcvttss2si %xmm3, %rax
-; AVX2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttss2si %xmm2, %rdx
-; AVX2-NEXT: vucomiss %xmm1, %xmm2
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm2
-; AVX2-NEXT: vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
-; AVX2-NEXT: vsubss %xmm1, %xmm3, %xmm4
-; AVX2-NEXT: vcvttss2si %xmm4, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttss2si %xmm3, %rdx
-; AVX2-NEXT: vucomiss %xmm1, %xmm3
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm3
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm3
-; AVX2-NEXT: vcvttss2si %xmm3, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttss2si %xmm0, %rdx
-; AVX2-NEXT: vucomiss %xmm1, %xmm0
-; AVX2-NEXT: cmovaeq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm3
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT: vsubss %xmm1, %xmm0, %xmm4
-; AVX2-NEXT: vcvttss2si %xmm4, %rax
-; AVX2-NEXT: xorq %rcx, %rax
-; AVX2-NEXT: vcvttss2si %xmm0, %rcx
-; AVX2-NEXT: vucomiss %xmm1, %xmm0
-; AVX2-NEXT: cmovaeq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm3[0],xmm0[0]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_8f32_to_4i64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2usi %xmm1, %rax
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rcx
-; AVX512F-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-NEXT: vcvttss2usi %xmm1, %rdx
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rsi
-; AVX512F-NEXT: vmovq %rsi, %xmm0
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovq %rcx, %xmm1
-; AVX512F-NEXT: vmovq %rax, %xmm2
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_8f32_to_4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2usi %xmm1, %rax
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rcx
-; AVX512VL-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-NEXT: vcvttss2usi %xmm1, %rdx
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rsi
-; AVX512VL-NEXT: vmovq %rsi, %xmm0
-; AVX512VL-NEXT: vmovq %rdx, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vmovq %rcx, %xmm1
-; AVX512VL-NEXT: vmovq %rax, %xmm2
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_8f32_to_4i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_8f32_to_4i64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512VLDQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <8 x float> %a to <8 x i64>
- %shuf = shufflevector <8 x i64> %cvt, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i64> %shuf
-}
-
-;
-; Constant Folding
-;
-
-define <2 x i64> @fptosi_2f64_to_2i64_const() {
-; SSE-LABEL: fptosi_2f64_to_2i64_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i64_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; AVX-NEXT: retq
- %cvt = fptosi <2 x double> <double 1.0, double -1.0> to <2 x i64>
- ret <2 x i64> %cvt
-}
-
-define <4 x i32> @fptosi_2f64_to_2i32_const() {
-; SSE-LABEL: fptosi_2f64_to_2i32_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u>
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4294967295,1,u,u>
-; AVX-NEXT: retq
- %cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32>
- %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- ret <4 x i32> %ext
-}
-
-define <4 x i64> @fptosi_4f64_to_4i64_const() {
-; SSE-LABEL: fptosi_4f64_to_4i64_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,18446744073709551613]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_4f64_to_4i64_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,18446744073709551613]
-; AVX-NEXT: retq
- %cvt = fptosi <4 x double> <double 1.0, double -1.0, double 2.0, double -3.0> to <4 x i64>
- ret <4 x i64> %cvt
-}
-
-define <4 x i32> @fptosi_4f64_to_4i32_const() {
-; SSE-LABEL: fptosi_4f64_to_4i32_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_4f64_to_4i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,1,4294967294,3]
-; AVX-NEXT: retq
- %cvt = fptosi <4 x double> <double -1.0, double 1.0, double -2.0, double 3.0> to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-define <2 x i64> @fptoui_2f64_to_2i64_const() {
-; SSE-LABEL: fptoui_2f64_to_2i64_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_2f64_to_2i64_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4]
-; AVX-NEXT: retq
- %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i64>
- ret <2 x i64> %cvt
-}
-
-define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i32_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u>
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_2f64_to_2i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <2,4,u,u>
-; AVX-NEXT: retq
- %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32>
- %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- ret <4 x i32> %ext
-}
-
-define <4 x i64> @fptoui_4f64_to_4i64_const(<4 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_4i64_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4]
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,8]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_4f64_to_4i64_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [2,4,6,8]
-; AVX-NEXT: retq
- %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i64>
- ret <4 x i64> %cvt
-}
-
-define <4 x i32> @fptoui_4f64_to_4i32_const(<4 x double> %a) {
-; SSE-LABEL: fptoui_4f64_to_4i32_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,6,8]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_4f64_to_4i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [2,4,6,8]
-; AVX-NEXT: retq
- %cvt = fptoui <4 x double> <double 2.0, double 4.0, double 6.0, double 8.0> to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-define <4 x i32> @fptosi_4f32_to_4i32_const() {
-; SSE-LABEL: fptosi_4f32_to_4i32_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_4f32_to_4i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,4294967295,2,3]
-; AVX-NEXT: retq
- %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-define <4 x i64> @fptosi_4f32_to_4i64_const() {
-; SSE-LABEL: fptosi_4f32_to_4i64_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,18446744073709551615]
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [2,3]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_4f32_to_4i64_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,18446744073709551615,2,3]
-; AVX-NEXT: retq
- %cvt = fptosi <4 x float> <float 1.0, float -1.0, float 2.0, float 3.0> to <4 x i64>
- ret <4 x i64> %cvt
-}
-
-define <8 x i32> @fptosi_8f32_to_8i32_const(<8 x float> %a) {
-; SSE-LABEL: fptosi_8f32_to_8i32_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,4294967295,2,3]
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [6,4294967288,2,4294967295]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_8f32_to_8i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,4294967295,2,3,6,4294967288,2,4294967295]
-; AVX-NEXT: retq
- %cvt = fptosi <8 x float> <float 1.0, float -1.0, float 2.0, float 3.0, float 6.0, float -8.0, float 2.0, float -1.0> to <8 x i32>
- ret <8 x i32> %cvt
-}
-
-define <4 x i32> @fptoui_4f32_to_4i32_const(<4 x float> %a) {
-; SSE-LABEL: fptoui_4f32_to_4i32_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_4f32_to_4i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [1,2,4,6]
-; AVX-NEXT: retq
- %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 6.0> to <4 x i32>
- ret <4 x i32> %cvt
-}
-
-define <4 x i64> @fptoui_4f32_to_4i64_const() {
-; SSE-LABEL: fptoui_4f32_to_4i64_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2]
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [4,8]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_4f32_to_4i64_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
-; AVX-NEXT: retq
- %cvt = fptoui <4 x float> <float 1.0, float 2.0, float 4.0, float 8.0> to <4 x i64>
- ret <4 x i64> %cvt
-}
-
-define <8 x i32> @fptoui_8f32_to_8i32_const(<8 x float> %a) {
-; SSE-LABEL: fptoui_8f32_to_8i32_const:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,2,4,6]
-; SSE-NEXT: movaps {{.*#+}} xmm1 = [8,6,4,1]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_8f32_to_8i32_const:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,6,8,6,4,1]
-; AVX-NEXT: retq
- %cvt = fptoui <8 x float> <float 1.0, float 2.0, float 4.0, float 6.0, float 8.0, float 6.0, float 4.0, float 1.0> to <8 x i32>
- ret <8 x i32> %cvt
-}
-
-;
-; Special Cases
-;
-
-define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
-; SSE-LABEL: fptosi_2f16_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: callq __gnu_f2h_ieee
-; SSE-NEXT: movzwl %ax, %edi
-; SSE-NEXT: callq __gnu_h2f_ieee
-; SSE-NEXT: movss %xmm0, (%rsp) # 4-byte Spill
-; SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: callq __gnu_f2h_ieee
-; SSE-NEXT: movzwl %ax, %edi
-; SSE-NEXT: callq __gnu_h2f_ieee
-; SSE-NEXT: cvttss2si %xmm0, %eax
-; SSE-NEXT: cvttss2si (%rsp), %ecx # 4-byte Folded Reload
-; SSE-NEXT: movd %ecx, %xmm0
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT: popq %rax
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptosi_2f16_to_4i32:
-; VEX: # %bb.0:
-; VEX-NEXT: pushq %rax
-; VEX-NEXT: vmovss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; VEX-NEXT: vmovaps %xmm1, %xmm0
-; VEX-NEXT: callq __gnu_f2h_ieee
-; VEX-NEXT: movzwl %ax, %edi
-; VEX-NEXT: callq __gnu_h2f_ieee
-; VEX-NEXT: vmovss %xmm0, (%rsp) # 4-byte Spill
-; VEX-NEXT: vmovss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
-; VEX-NEXT: # xmm0 = mem[0],zero,zero,zero
-; VEX-NEXT: callq __gnu_f2h_ieee
-; VEX-NEXT: movzwl %ax, %edi
-; VEX-NEXT: callq __gnu_h2f_ieee
-; VEX-NEXT: vcvttss2si %xmm0, %eax
-; VEX-NEXT: vcvttss2si (%rsp), %ecx # 4-byte Folded Reload
-; VEX-NEXT: vmovd %ecx, %xmm0
-; VEX-NEXT: vmovd %eax, %xmm1
-; VEX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; VEX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; VEX-NEXT: popq %rax
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: fptosi_2f16_to_4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1
-; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX512-NEXT: vcvttss2si %xmm0, %eax
-; AVX512-NEXT: vcvttss2si %xmm1, %ecx
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512-NEXT: retq
- %cvt = fptosi <2 x half> %a to <2 x i32>
- %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %ext
-}
-
-define <4 x i32> @fptosi_2f80_to_4i32(<2 x x86_fp80> %a) nounwind {
-; SSE-LABEL: fptosi_2f80_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: fldt {{[0-9]+}}(%rsp)
-; SSE-NEXT: fldt {{[0-9]+}}(%rsp)
-; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: orl $3072, %eax # imm = 0xC00
-; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fnstcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movzwl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT: orl $3072, %eax # imm = 0xC00
-; SSE-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fistpl -{{[0-9]+}}(%rsp)
-; SSE-NEXT: fldcw -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f80_to_4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
-; AVX-NEXT: fldt {{[0-9]+}}(%rsp)
-; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp)
-; AVX-NEXT: fisttpl -{{[0-9]+}}(%rsp)
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX-NEXT: retq
- %cvt = fptosi <2 x x86_fp80> %a to <2 x i32>
- %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %ext
-}
-
-define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind {
-; SSE-LABEL: fptosi_2f128_to_4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq %rcx, %r14
-; SSE-NEXT: movq %rdx, %rbx
-; SSE-NEXT: callq __fixtfsi
-; SSE-NEXT: movl %eax, %ebp
-; SSE-NEXT: movq %rbx, %rdi
-; SSE-NEXT: movq %r14, %rsi
-; SSE-NEXT: callq __fixtfsi
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: movd %ebp, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f128_to_4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: pushq %rbp
-; AVX-NEXT: pushq %r14
-; AVX-NEXT: pushq %rbx
-; AVX-NEXT: movq %rcx, %r14
-; AVX-NEXT: movq %rdx, %rbx
-; AVX-NEXT: callq __fixtfsi
-; AVX-NEXT: movl %eax, %ebp
-; AVX-NEXT: movq %rbx, %rdi
-; AVX-NEXT: movq %r14, %rsi
-; AVX-NEXT: callq __fixtfsi
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vmovd %ebp, %xmm1
-; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX-NEXT: popq %rbx
-; AVX-NEXT: popq %r14
-; AVX-NEXT: popq %rbp
-; AVX-NEXT: retq
- %cvt = fptosi <2 x fp128> %a to <2 x i32>
- %ext = shufflevector <2 x i32> %cvt, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %ext
-}
-
-define <2 x i8> @fptosi_2f32_to_2i8(<2 x float> %a) {
-; SSE-LABEL: fptosi_2f32_to_2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f32_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
- %cvt = fptosi <2 x float> %a to <2 x i8>
- ret <2 x i8> %cvt
-}
-
-define <2 x i16> @fptosi_2f32_to_2i16(<2 x float> %a) {
-; SSE-LABEL: fptosi_2f32_to_2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f32_to_2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: retq
- %cvt = fptosi <2 x float> %a to <2 x i16>
- ret <2 x i16> %cvt
-}
-
-define <2 x i8> @fptoui_2f32_to_2i8(<2 x float> %a) {
-; SSE-LABEL: fptoui_2f32_to_2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_2f32_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
- %cvt = fptoui <2 x float> %a to <2 x i8>
- ret <2 x i8> %cvt
-}
-
-define <2 x i16> @fptoui_2f32_to_2i16(<2 x float> %a) {
-; SSE-LABEL: fptoui_2f32_to_2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_2f32_to_2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: retq
- %cvt = fptoui <2 x float> %a to <2 x i16>
- ret <2 x i16> %cvt
-}
-
-define <2 x i8> @fptosi_2f64_to_2i8(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
- %cvt = fptosi <2 x double> %a to <2 x i8>
- ret <2 x i8> %cvt
-}
-
-define <2 x i16> @fptosi_2f64_to_2i16(<2 x double> %a) {
-; SSE-LABEL: fptosi_2f64_to_2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptosi_2f64_to_2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: retq
- %cvt = fptosi <2 x double> %a to <2 x i16>
- ret <2 x i16> %cvt
-}
-
-define <2 x i8> @fptoui_2f64_to_2i8(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_2f64_to_2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX-NEXT: retq
- %cvt = fptoui <2 x double> %a to <2 x i8>
- ret <2 x i8> %cvt
-}
-
-define <2 x i16> @fptoui_2f64_to_2i16(<2 x double> %a) {
-; SSE-LABEL: fptoui_2f64_to_2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: fptoui_2f64_to_2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX-NEXT: retq
- %cvt = fptoui <2 x double> %a to <2 x i16>
- ret <2 x i16> %cvt
-}
-
-define <8 x i16> @fptosi_8f64_to_8i16(<8 x double> %a) {
-; SSE-LABEL: fptosi_8f64_to_8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
-; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: packssdw %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptosi_8f64_to_8i16:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
-; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; VEX-NEXT: vzeroupper
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_8f64_to_8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_8f64_to_8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_8f64_to_8i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_8f64_to_8i16:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %cvt = fptosi <8 x double> %a to <8 x i16>
- ret <8 x i16> %cvt
-}
-
-define <8 x i16> @fptoui_8f64_to_8i16(<8 x double> %a) {
-; SSE-LABEL: fptoui_8f64_to_8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttpd2dq %xmm3, %xmm3
-; SSE-NEXT: cvttpd2dq %xmm2, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: cvttpd2dq %xmm1, %xmm1
-; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptoui_8f64_to_8i16:
-; VEX: # %bb.0:
-; VEX-NEXT: vcvttpd2dq %ymm1, %xmm1
-; VEX-NEXT: vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; VEX-NEXT: vzeroupper
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_8f64_to_8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_8f64_to_8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_8f64_to_8i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_8f64_to_8i16:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttpd2dq %zmm0, %ymm0
-; AVX512VLDQ-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %cvt = fptoui <8 x double> %a to <8 x i16>
- ret <8 x i16> %cvt
-}
-
-define <16 x i8> @fptosi_16f32_to_16i8(<16 x float> %a) {
-; SSE-LABEL: fptosi_16f32_to_16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptosi_16f32_to_16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptosi_16f32_to_16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: fptosi_16f32_to_16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = fptosi <16 x float> %a to <16 x i8>
- ret <16 x i8> %cvt
-}
-
-define <16 x i8> @fptoui_16f32_to_16i8(<16 x float> %a) {
-; SSE-LABEL: fptoui_16f32_to_16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: cvttps2dq %xmm3, %xmm3
-; SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: fptoui_16f32_to_16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: fptoui_16f32_to_16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vcvttps2dq %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vcvttps2dq %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: fptoui_16f32_to_16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vcvttps2dq %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = fptoui <16 x float> %a to <16 x i8>
- ret <16 x i8> %cvt
-}
-
-define <2 x i64> @fptosi_2f32_to_2i64_load(<2 x float>* %x) {
-; SSE-LABEL: fptosi_2f32_to_2i64_load:
-; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movq %rax, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptosi_2f32_to_2i64_load:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT: vcvttss2si %xmm0, %rax
-; VEX-NEXT: vmovq %rax, %xmm1
-; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT: vcvttss2si %xmm0, %rax
-; VEX-NEXT: vmovq %rax, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptosi_2f32_to_2i64_load:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2si %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptosi_2f32_to_2i64_load:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2si %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptosi_2f32_to_2i64_load:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT: vcvttps2qq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptosi_2f32_to_2i64_load:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2qq (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %a = load <2 x float>, <2 x float>* %x
- %b = fptosi <2 x float> %a to <2 x i64>
- ret <2 x i64> %b
-}
-
-define <2 x i64> @fptoui_2f32_to_2i64_load(<2 x float>* %x) {
-; SSE-LABEL: fptoui_2f32_to_2i64_load:
-; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: subss %xmm2, %xmm0
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttss2si %xmm1, %rdx
-; SSE-NEXT: ucomiss %xmm2, %xmm1
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movq %rdx, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE-NEXT: movaps %xmm1, %xmm3
-; SSE-NEXT: subss %xmm2, %xmm3
-; SSE-NEXT: cvttss2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
-; SSE-NEXT: cvttss2si %xmm1, %rcx
-; SSE-NEXT: ucomiss %xmm2, %xmm1
-; SSE-NEXT: cmovaeq %rax, %rcx
-; SSE-NEXT: movq %rcx, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE-NEXT: retq
-;
-; VEX-LABEL: fptoui_2f32_to_2i64_load:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
-; VEX-NEXT: vcvttss2si %xmm2, %rax
-; VEX-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttss2si %xmm0, %rdx
-; VEX-NEXT: vucomiss %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rdx
-; VEX-NEXT: vmovq %rdx, %xmm2
-; VEX-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm3
-; VEX-NEXT: vcvttss2si %xmm3, %rax
-; VEX-NEXT: xorq %rcx, %rax
-; VEX-NEXT: vcvttss2si %xmm0, %rcx
-; VEX-NEXT: vucomiss %xmm1, %xmm0
-; VEX-NEXT: cmovaeq %rax, %rcx
-; VEX-NEXT: vmovq %rcx, %xmm0
-; VEX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: fptoui_2f32_to_2i64_load:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512F-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fptoui_2f32_to_2i64_load:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm1
-; AVX512VL-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT: vcvttss2usi %xmm0, %rax
-; AVX512VL-NEXT: vmovq %rax, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: fptoui_2f32_to_2i64_load:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT: vcvttps2uqq %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: fptoui_2f32_to_2i64_load:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvttps2uqq (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %a = load <2 x float>, <2 x float>* %x
- %b = fptoui <2 x float> %a to <2 x i64>
- ret <2 x i64> %b
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ
-;
-; 32-bit tests to make sure we're not doing anything stupid.
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2
-; RUN: llc < %s -disable-peephole -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1
-
-;
-; Signed Integer to Double
-;
-
-define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_2i64_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_2i64_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2sd %rax, %xmm1
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2sd %rax, %xmm0
-; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_2i64_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_2i64_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_2i64_to_2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_2i64_to_2f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = sitofp <2 x i64> %a to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_2i32_to_2f64(<4 x i32> %a) {
-; SSE-LABEL: sitofp_2i32_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_2i32_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
- %cvt = sitofp <2 x i32> %shuf to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
-; SSE-LABEL: sitofp_4i32_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_4i32_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = sitofp <4 x i32> %a to <4 x double>
- %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %shuf
-}
-
-define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_2i16_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_2i16_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_2i16_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
- %cvt = sitofp <2 x i16> %shuf to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_8i16_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_8i16_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_8i16_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovsxwd %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: sitofp_8i16_to_2f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = sitofp <8 x i16> %a to <8 x double>
- %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %shuf
-}
-
-define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_2i8_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_2i8_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_2i8_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
- %cvt = sitofp <2 x i8> %shuf to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_16i8_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_16i8_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_16i8_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovsxbd %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: sitofp_16i8_to_2f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = sitofp <16 x i8> %a to <16 x double>
- %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %shuf
-}
-
-define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
-; SSE2-LABEL: sitofp_4i64_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: cvtsi2sd %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: cvtsi2sd %rax, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_4i64_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2sd %rax, %xmm2
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2sd %rax, %xmm0
-; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2sd %rax, %xmm2
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2sd %rax, %xmm1
-; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_4i64_to_4f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpextrq $1, %xmm1, %rax
-; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX1-NEXT: vmovq %xmm1, %rax
-; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_4i64_to_4f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm1, %rax
-; AVX2-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX2-NEXT: vmovq %xmm1, %rax
-; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_4i64_to_4f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_4i64_to_4f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_4i64_to_4f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_4i64_to_4f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2pd %ymm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %cvt = sitofp <4 x i64> %a to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
-; SSE-LABEL: sitofp_4i32_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_4i32_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %cvt = sitofp <4 x i32> %a to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_4i16_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_4i16_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_4i16_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = sitofp <4 x i16> %shuf to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_8i16_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_8i16_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_8i16_to_4f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovsxwd %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: sitofp_8i16_to_4f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
- %cvt = sitofp <8 x i16> %a to <8 x double>
- %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x double> %shuf
-}
-
-define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_4i8_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_4i8_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_4i8_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = sitofp <4 x i8> %shuf to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_16i8_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_16i8_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_16i8_to_4f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovsxbd %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: sitofp_16i8_to_4f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
- %cvt = sitofp <16 x i8> %a to <16 x double>
- %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x double> %shuf
-}
-
-;
-; Unsigned Integer to Double
-;
-
-define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
-; SSE2-LABEL: uitofp_2i64_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: por {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrlq $32, %xmm0
-; SSE2-NEXT: por {{.*}}(%rip), %xmm0
-; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_2i64_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: por {{.*}}(%rip), %xmm1
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: por {{.*}}(%rip), %xmm0
-; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_2i64_to_2f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_2i64_to_2f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_2i64_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_2i64_to_2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_2i64_to_2f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <2 x i64> %a to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
-; SSE2-LABEL: uitofp_2i32_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_2i32_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_2i32_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_2i32_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_2i32_to_2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_2i32_to_2f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %shuf = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
- %cvt = uitofp <2 x i32> %shuf to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
-; SSE2-LABEL: uitofp_4i32_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i32_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_4i32_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_4i32_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_4i32_to_2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <4 x i32> %a to <4 x double>
- %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %shuf
-}
-
-define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_2i16_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_2i16_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_2i16_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
- %cvt = uitofp <2 x i16> %shuf to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_8i16_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_8i16_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_8i16_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: uitofp_8i16_to_2f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = uitofp <8 x i16> %a to <8 x double>
- %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %shuf
-}
-
-define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_2i8_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_2i8_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_2i8_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
- %cvt = uitofp <2 x i8> %shuf to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_16i8_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_16i8_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_16i8_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: uitofp_16i8_to_2f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = uitofp <16 x i8> %a to <16 x double>
- %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %shuf
-}
-
-define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
-; SSE2-LABEL: uitofp_4i64_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: psrlq $32, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; SSE2-NEXT: subpd %xmm6, %xmm0
-; SSE2-NEXT: addpd %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlq $32, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: subpd %xmm6, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i64_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; SSE41-NEXT: por %xmm4, %xmm3
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; SSE41-NEXT: subpd %xmm6, %xmm0
-; SSE41-NEXT: addpd %xmm3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: por %xmm5, %xmm1
-; SSE41-NEXT: subpd %xmm6, %xmm1
-; SSE41-NEXT: addpd %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_4i64_to_4f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_4i64_to_4f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_4i64_to_4f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_4i64_to_4f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
-; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_4i64_to_4f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i64_to_4f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2pd %ymm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <4 x i64> %a to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
-; SSE2-LABEL: uitofp_4i32_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE2-NEXT: mulpd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE2-NEXT: mulpd %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE2-NEXT: addpd %xmm5, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i32_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE41-NEXT: mulpd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE41-NEXT: mulpd %xmm2, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE41-NEXT: addpd %xmm5, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_4i32_to_4f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_4i32_to_4f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
-; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_4i32_to_4f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_4i32_to_4f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2pd %xmm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_4i32_to_4f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i32_to_4f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <4 x i32> %a to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_4i16_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i16_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_4i16_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = uitofp <4 x i16> %shuf to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_8i16_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_8i16_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_8i16_to_4f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: uitofp_8i16_to_4f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
- %cvt = uitofp <8 x i16> %a to <8 x double>
- %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x double> %shuf
-}
-
-define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_4i8_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i8_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_4i8_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = uitofp <4 x i8> %shuf to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_16i8_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_16i8_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_16i8_to_4f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; VEX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT: retq
-;
-; AVX512-LABEL: uitofp_16i8_to_4f64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vcvtdq2pd %ymm0, %zmm0
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
- %cvt = uitofp <16 x i8> %a to <16 x double>
- %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x double> %shuf
-}
-
-;
-; Signed Integer to Float
-;
-
-define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_2i64_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_2i64_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_2i64_to_4f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_2i64_to_4f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_2i64_to_4f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_2i64_to_4f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = sitofp <2 x i64> %a to <2 x float>
- %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- ret <4 x float> %ext
-}
-
-define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_2i64_to_4f32_zero:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_2i64_to_4f32_zero:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_2i64_to_4f32_zero:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = sitofp <2 x i64> %a to <2 x float>
- %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %ext
-}
-
-define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
-; SSE2-LABEL: sitofp_4i64_to_4f32_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_4i64_to_4f32_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_4i64_to_4f32_undef:
-; VEX: # %bb.0:
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- %cvt = sitofp <4 x i64> %ext to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
-; SSE-LABEL: sitofp_4i32_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_4i32_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = sitofp <4 x i32> %a to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_4i16_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_4i16_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_4i16_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = sitofp <4 x i16> %shuf to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_8i16_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_8i16_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_8i16_to_4f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_8i16_to_4f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sitofp_8i16_to_4f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = sitofp <8 x i16> %a to <8 x float>
- %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %shuf
-}
-
-define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_4i8_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_4i8_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_4i8_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = sitofp <4 x i8> %shuf to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_16i8_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_16i8_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_16i8_to_4f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_16i8_to_4f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sitofp_16i8_to_4f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = sitofp <16 x i8> %a to <16 x float>
- %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %shuf
-}
-
-define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
-; SSE2-LABEL: sitofp_4i64_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_4i64_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_4i64_to_4f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_4i64_to_4f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_4i64_to_4f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_4i64_to_4f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2ps %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %cvt = sitofp <4 x i64> %a to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
-; SSE-LABEL: sitofp_8i32_to_8f32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_8i32_to_8f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: retq
- %cvt = sitofp <8 x i32> %a to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
-; SSE2-LABEL: sitofp_8i16_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_8i16_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_8i16_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_8i16_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sitofp_8i16_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: retq
- %cvt = sitofp <8 x i16> %a to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_8i8_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_8i8_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_8i8_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_8i8_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sitofp_8i8_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: retq
- %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %cvt = sitofp <8 x i8> %shuf to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
-; SSE2-LABEL: sitofp_16i8_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_16i8_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_16i8_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_16i8_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sitofp_16i8_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
- %cvt = sitofp <16 x i8> %a to <16 x float>
- %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x float> %shuf
-}
-
-;
-; Unsigned Integer to Float
-;
-
-define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
-; SSE2-LABEL: uitofp_2i64_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB39_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: jmp .LBB39_3
-; SSE2-NEXT: .LBB39_1:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: addss %xmm0, %xmm0
-; SSE2-NEXT: .LBB39_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB39_4
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-; SSE2-NEXT: .LBB39_4:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: addss %xmm1, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_2i64_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB39_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: jmp .LBB39_3
-; SSE41-NEXT: .LBB39_1:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: addss %xmm1, %xmm1
-; SSE41-NEXT: .LBB39_3:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB39_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB39_4:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: addss %xmm0, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_2i64_to_4f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB39_1
-; VEX-NEXT: # %bb.2:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: jmp .LBB39_3
-; VEX-NEXT: .LBB39_1:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT: .LBB39_3:
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB39_4
-; VEX-NEXT: # %bb.5:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT: retq
-; VEX-NEXT: .LBB39_4:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_2i64_to_4f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_2i64_to_4f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_2i64_to_4f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <2 x i64> %a to <2 x float>
- %ext = shufflevector <2 x float> %cvt, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- ret <4 x float> %ext
-}
-
-define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
-; SSE2-LABEL: uitofp_2i64_to_2f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB40_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: jmp .LBB40_3
-; SSE2-NEXT: .LBB40_1:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: addss %xmm1, %xmm1
-; SSE2-NEXT: .LBB40_3:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB40_4
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: jmp .LBB40_6
-; SSE2-NEXT: .LBB40_4:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: addss %xmm0, %xmm0
-; SSE2-NEXT: .LBB40_6:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_2i64_to_2f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB40_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: jmp .LBB40_3
-; SSE41-NEXT: .LBB40_1:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: addss %xmm0, %xmm0
-; SSE41-NEXT: .LBB40_3:
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB40_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB40_4:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: addss %xmm1, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_2i64_to_2f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB40_1
-; VEX-NEXT: # %bb.2:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: jmp .LBB40_3
-; VEX-NEXT: .LBB40_1:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT: .LBB40_3:
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB40_4
-; VEX-NEXT: # %bb.5:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; VEX-NEXT: retq
-; VEX-NEXT: .LBB40_4:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_2i64_to_2f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_2i64_to_2f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_2i64_to_2f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <2 x i64> %a to <2 x float>
- %ext = shufflevector <2 x float> %cvt, <2 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %ext
-}
-
-define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
-; SSE2-LABEL: uitofp_4i64_to_4f32_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB41_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: jmp .LBB41_3
-; SSE2-NEXT: .LBB41_1:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: addss %xmm1, %xmm1
-; SSE2-NEXT: .LBB41_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB41_4
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: jmp .LBB41_6
-; SSE2-NEXT: .LBB41_4:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: addss %xmm0, %xmm0
-; SSE2-NEXT: .LBB41_6:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm1[0],zero
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i64_to_4f32_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB41_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: jmp .LBB41_3
-; SSE41-NEXT: .LBB41_1:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: addss %xmm1, %xmm1
-; SSE41-NEXT: .LBB41_3:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB41_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB41_4:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: addss %xmm0, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_4i64_to_4f32_undef:
-; VEX: # %bb.0:
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB41_1
-; VEX-NEXT: # %bb.2:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: jmp .LBB41_3
-; VEX-NEXT: .LBB41_1:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT: .LBB41_3:
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB41_4
-; VEX-NEXT: # %bb.5:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT: retq
-; VEX-NEXT: .LBB41_4:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0
-; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
- %cvt = uitofp <4 x i64> %ext to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
-; SSE2-LABEL: uitofp_4i32_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: por {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: por {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addps {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i32_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; SSE41-NEXT: addps {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_4i32_to_4f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_4i32_to_4f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_4i32_to_4f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_4i32_to_4f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_4i32_to_4f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <4 x i32> %a to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_4i16_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i16_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_4i16_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = uitofp <4 x i16> %shuf to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_8i16_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_8i16_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_8i16_to_4f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_8i16_to_4f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: uitofp_8i16_to_4f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = uitofp <8 x i16> %a to <8 x float>
- %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %shuf
-}
-
-define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_4i8_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i8_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_4i8_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %cvt = uitofp <4 x i8> %shuf to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_16i8_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_16i8_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_16i8_to_4f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_16i8_to_4f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: uitofp_16i8_to_4f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %cvt = uitofp <16 x i8> %a to <16 x float>
- %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x float> %shuf
-}
-
-define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
-; SSE2-LABEL: uitofp_4i64_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB47_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: jmp .LBB47_3
-; SSE2-NEXT: .LBB47_1:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: addss %xmm2, %xmm2
-; SSE2-NEXT: .LBB47_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB47_4
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: jmp .LBB47_6
-; SSE2-NEXT: .LBB47_4:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: addss %xmm3, %xmm3
-; SSE2-NEXT: .LBB47_6:
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB47_7
-; SSE2-NEXT: # %bb.8:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: jmp .LBB47_9
-; SSE2-NEXT: .LBB47_7:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: addss %xmm1, %xmm1
-; SSE2-NEXT: .LBB47_9:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB47_10
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: jmp .LBB47_12
-; SSE2-NEXT: .LBB47_10:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: addss %xmm0, %xmm0
-; SSE2-NEXT: .LBB47_12:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_4i64_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB47_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: jmp .LBB47_3
-; SSE41-NEXT: .LBB47_1:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: addss %xmm2, %xmm2
-; SSE41-NEXT: .LBB47_3:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB47_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: jmp .LBB47_6
-; SSE41-NEXT: .LBB47_4:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: addss %xmm0, %xmm0
-; SSE41-NEXT: .LBB47_6:
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB47_7
-; SSE41-NEXT: # %bb.8:
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: jmp .LBB47_9
-; SSE41-NEXT: .LBB47_7:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: addss %xmm2, %xmm2
-; SSE41-NEXT: .LBB47_9:
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB47_10
-; SSE41-NEXT: # %bb.11:
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB47_10:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: addss %xmm1, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_4i64_to_4f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: testq %rax, %rax
-; AVX1-NEXT: js .LBB47_1
-; AVX1-NEXT: # %bb.2:
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX1-NEXT: jmp .LBB47_3
-; AVX1-NEXT: .LBB47_1:
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: .LBB47_3:
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: testq %rax, %rax
-; AVX1-NEXT: js .LBB47_4
-; AVX1-NEXT: # %bb.5:
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX1-NEXT: jmp .LBB47_6
-; AVX1-NEXT: .LBB47_4:
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: .LBB47_6:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: testq %rax, %rax
-; AVX1-NEXT: js .LBB47_7
-; AVX1-NEXT: # %bb.8:
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX1-NEXT: jmp .LBB47_9
-; AVX1-NEXT: .LBB47_7:
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: .LBB47_9:
-; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX1-NEXT: vpextrq $1, %xmm0, %rax
-; AVX1-NEXT: testq %rax, %rax
-; AVX1-NEXT: js .LBB47_10
-; AVX1-NEXT: # %bb.11:
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB47_10:
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq %rcx
-; AVX1-NEXT: andl $1, %eax
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_4i64_to_4f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: testq %rax, %rax
-; AVX2-NEXT: js .LBB47_1
-; AVX2-NEXT: # %bb.2:
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX2-NEXT: jmp .LBB47_3
-; AVX2-NEXT: .LBB47_1:
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: .LBB47_3:
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: testq %rax, %rax
-; AVX2-NEXT: js .LBB47_4
-; AVX2-NEXT: # %bb.5:
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX2-NEXT: jmp .LBB47_6
-; AVX2-NEXT: .LBB47_4:
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: .LBB47_6:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: testq %rax, %rax
-; AVX2-NEXT: js .LBB47_7
-; AVX2-NEXT: # %bb.8:
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX2-NEXT: jmp .LBB47_9
-; AVX2-NEXT: .LBB47_7:
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: .LBB47_9:
-; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX2-NEXT: vpextrq $1, %xmm0, %rax
-; AVX2-NEXT: testq %rax, %rax
-; AVX2-NEXT: js .LBB47_10
-; AVX2-NEXT: # %bb.11:
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-; AVX2-NEXT: .LBB47_10:
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq %rcx
-; AVX2-NEXT: andl $1, %eax
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_4i64_to_4f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_4i64_to_4f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2ps %ymm0, %xmm0
-; AVX512VLDQ-NEXT: vzeroupper
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <4 x i64> %a to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) {
-; SSE2-LABEL: uitofp_8i32_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; SSE2-NEXT: addps %xmm6, %xmm0
-; SSE2-NEXT: addps %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: addps %xmm6, %xmm1
-; SSE2-NEXT: addps %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_8i32_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; SSE41-NEXT: movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; SSE41-NEXT: addps %xmm5, %xmm0
-; SSE41-NEXT: addps %xmm3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; SSE41-NEXT: addps %xmm5, %xmm1
-; SSE41-NEXT: addps %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_8i32_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_8i32_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_8i32_to_8f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_8i32_to_8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2ps %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_8i32_to_8f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_8i32_to_8f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2ps %ymm0, %ymm0
-; AVX512VLDQ-NEXT: retq
- %cvt = uitofp <8 x i32> %a to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
-; SSE2-LABEL: uitofp_8i16_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_8i16_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_8i16_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_8i16_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: uitofp_8i16_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: retq
- %cvt = uitofp <8 x i16> %a to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_8i8_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_8i8_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_8i8_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_8i8_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: uitofp_8i8_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: retq
- %shuf = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %cvt = uitofp <8 x i8> %shuf to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) {
-; SSE2-LABEL: uitofp_16i8_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_16i8_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_16i8_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_16i8_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: uitofp_16i8_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vcvtdq2ps %zmm0, %zmm0
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
- %cvt = uitofp <16 x i8> %a to <16 x float>
- %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x float> %shuf
-}
-
-;
-; Load Signed Integer to Double
-;
-
-define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
-; SSE2-LABEL: sitofp_load_2i64_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_2i64_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2sd %rax, %xmm1
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2sd %rax, %xmm0
-; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_load_2i64_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm0
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <2 x i64>, <2 x i64> *%a
- %cvt = sitofp <2 x i64> %ld to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) {
-; SSE-LABEL: sitofp_load_2i32_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_2i32_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
-; AVX-NEXT: retq
- %ld = load <2 x i32>, <2 x i32> *%a
- %cvt = sitofp <2 x i32> %ld to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_volatile_load_4i32_to_2f64(<4 x i32> *%a) {
-; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load volatile <4 x i32>, <4 x i32> *%a
- %b = shufflevector <4 x i32> %ld, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
- %cvt = sitofp <2 x i32> %b to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
-; SSE-LABEL: sitofp_load_4i32_to_2f64_2:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2pd (%rdi), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_4i32_to_2f64_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2pd (%rdi), %xmm0
-; AVX-NEXT: retq
- %a = load <4 x i32>, <4 x i32>* %x
- %b = sitofp <4 x i32> %a to <4 x double>
- %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %c
-}
-
-define <2 x double> @sitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) {
-; SSE-LABEL: sitofp_volatile_load_4i32_to_2f64_2:
-; SSE: # %bb.0:
-; SSE-NEXT: movaps (%rdi), %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_volatile_load_4i32_to_2f64_2:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovaps (%rdi), %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %a = load volatile <4 x i32>, <4 x i32>* %x
- %b = sitofp <4 x i32> %a to <4 x double>
- %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %c
-}
-
-define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
-; SSE2-LABEL: sitofp_load_2i16_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_2i16_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_2i16_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load <2 x i16>, <2 x i16> *%a
- %cvt = sitofp <2 x i16> %ld to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) {
-; SSE2-LABEL: sitofp_load_2i8_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movzwl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_2i8_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl (%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_2i8_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load <2 x i8>, <2 x i8> *%a
- %cvt = sitofp <2 x i8> %ld to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
-; SSE2-LABEL: sitofp_load_4i64_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa 16(%rdi), %xmm2
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2sd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: cvtsi2sd %rax, %xmm2
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_4i64_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2sd %rax, %xmm2
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2sd %rax, %xmm0
-; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2sd %rax, %xmm2
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2sd %rax, %xmm1
-; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_load_4i64_to_4f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm0
-; VEX-NEXT: vmovdqa 16(%rdi), %xmm1
-; VEX-NEXT: vpextrq $1, %xmm1, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; VEX-NEXT: vmovq %xmm1, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; VEX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; VEX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT: vcvtqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2pd (%rdi), %ymm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <4 x i64>, <4 x i64> *%a
- %cvt = sitofp <4 x i64> %ld to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_load_4i32_to_4f64(<4 x i32> *%a) {
-; SSE-LABEL: sitofp_load_4i32_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa (%rdi), %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_4i32_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2pd (%rdi), %ymm0
-; AVX-NEXT: retq
- %ld = load <4 x i32>, <4 x i32> *%a
- %cvt = sitofp <4 x i32> %ld to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) {
-; SSE2-LABEL: sitofp_load_4i16_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_4i16_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd (%rdi), %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_4i16_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %ld = load <4 x i16>, <4 x i16> *%a
- %cvt = sitofp <4 x i16> %ld to <4 x double>
- ret <4 x double> %cvt
-}
-
-define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) {
-; SSE2-LABEL: sitofp_load_4i8_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_4i8_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd (%rdi), %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_4i8_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %ld = load <4 x i8>, <4 x i8> *%a
- %cvt = sitofp <4 x i8> %ld to <4 x double>
- ret <4 x double> %cvt
-}
-
-;
-; Load Unsigned Integer to Double
-;
-
-define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
-; SSE2-LABEL: uitofp_load_2i64_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967295,4294967295]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: por {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrlq $32, %xmm0
-; SSE2-NEXT: por {{.*}}(%rip), %xmm0
-; SSE2-NEXT: subpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_2i64_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: por {{.*}}(%rip), %xmm1
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: por {{.*}}(%rip), %xmm0
-; SSE41-NEXT: subpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_load_2i64_to_2f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_load_2i64_to_2f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
-; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <2 x i64>, <2 x i64> *%a
- %cvt = uitofp <2 x i64> %ld to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
-; SSE2-LABEL: uitofp_load_2i32_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_2i32_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_load_2i32_to_2f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <2 x i32>, <2 x i32> *%a
- %cvt = uitofp <2 x i32> %ld to <2 x double>
- ret <2 x double> %cvt
-}
-
-define <2 x double> @uitofp_load_4i32_to_2f64_2(<4 x i32>* %x) {
-; SSE2-LABEL: uitofp_load_4i32_to_2f64_2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i32_to_2f64_2:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_load_4i32_to_2f64_2:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm0
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_4i32_to_2f64_2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_4i32_to_2f64_2:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2pd (%rdi), %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i32_to_2f64_2:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i32_to_2f64_2:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %a = load <4 x i32>, <4 x i32>* %x
- %b = uitofp <4 x i32> %a to <4 x double>
- %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %c
-}
-
-; Test case: volatile load of <4 x i32> from %x, uitofp of all four lanes to
-; <4 x double>, then a shufflevector that keeps lanes 0 and 1 as the returned
-; <2 x double>.
-; Every check prefix below expects a full-width load from (%rdi) into xmm0
-; before the conversion; the AVX512VL and AVX512VLDQ checks use the register
-; form of vcvtudq2pd after a separate vmovaps.
-define <2 x double> @uitofp_volatile_load_4i32_to_2f64_2(<4 x i32>* %x) {
-; SSE2-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm0
-; VEX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; VEX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; VEX-NEXT: vpsrld $16, %xmm0, %xmm0
-; VEX-NEXT: vcvtdq2pd %xmm1, %xmm1
-; VEX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; VEX-NEXT: vmulpd {{.*}}(%rip), %xmm0, %xmm0
-; VEX-NEXT: vaddpd %xmm1, %xmm0, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps (%rdi), %xmm0
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovaps (%rdi), %xmm0
-; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_volatile_load_4i32_to_2f64_2:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vmovaps (%rdi), %xmm0
-; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %a = load volatile <4 x i32>, <4 x i32>* %x
- %b = uitofp <4 x i32> %a to <4 x double>
- %c = shufflevector <4 x double> %b, <4 x double> undef, <2 x i32> <i32 0, i32 1>
- ret <2 x double> %c
-}
-
-; Test case: load <2 x i16> from %a and uitofp it to <2 x double>.
-; The checks expect a movd/vmovd load, a zero-extension of the words to dwords
-; (punpcklwd against a zeroed register for SSE2, pmovzxwd/vpmovzxwd otherwise),
-; and then a signed cvtdq2pd on the zero-extended values.
-define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
-; SSE2-LABEL: uitofp_load_2i16_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_2i16_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_load_2i16_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load <2 x i16>, <2 x i16> *%a
- %cvt = uitofp <2 x i16> %ld to <2 x double>
- ret <2 x double> %cvt
-}
-
-; Test case: load <2 x i8> from %a and uitofp it to <2 x double>.
-; The checks expect a scalar movzwl load into %eax that is moved into xmm0,
-; a zero-extension of the bytes to dwords (punpcklbw + punpcklwd against a
-; zeroed register for SSE2, pmovzxbd/vpmovzxbd otherwise), then cvtdq2pd.
-define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
-; SSE2-LABEL: uitofp_load_2i8_to_2f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movzwl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_2i8_to_2f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl (%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_load_2i8_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load <2 x i8>, <2 x i8> *%a
- %cvt = uitofp <2 x i8> %ld to <2 x double>
- ret <2 x double> %cvt
-}
-
-; Test case: load <4 x i64> from %a and uitofp it to <4 x double>.
-; The AVX512DQ and AVX512VLDQ checks expect vcvtuqq2pd. All other prefixes
-; expect an expansion that isolates the low 32 bits of each element (pand,
-; blend with zero, or vpandq) and ORs them with one constant, shifts the high
-; 32 bits down with psrlq $32 and ORs them with a second constant, then
-; subtracts a floating-point constant and adds the two parts.
-define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
-; SSE2-LABEL: uitofp_load_4i64_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa 16(%rdi), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: psrlq $32, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; SSE2-NEXT: subpd %xmm6, %xmm0
-; SSE2-NEXT: addpd %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlq $32, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: subpd %xmm6, %xmm1
-; SSE2-NEXT: addpd %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i64_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4841369599423283200,4841369599423283200]
-; SSE41-NEXT: por %xmm4, %xmm3
-; SSE41-NEXT: psrlq $32, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [4985484787499139072,4985484787499139072]
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
-; SSE41-NEXT: subpd %xmm6, %xmm0
-; SSE41-NEXT: addpd %xmm3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: psrlq $32, %xmm1
-; SSE41-NEXT: por %xmm5, %xmm1
-; SSE41-NEXT: subpd %xmm6, %xmm1
-; SSE41-NEXT: addpd %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_load_4i64_to_4f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_load_4i64_to_4f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX2-NEXT: vsubpd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200]
-; AVX512F-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072]
-; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vbroadcastsd {{.*#+}} ymm2 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25]
-; AVX512F-NEXT: vsubpd %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1
-; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; AVX512VL-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT: vcvtuqq2pd %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2pd (%rdi), %ymm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <4 x i64>, <4 x i64> *%a
- %cvt = uitofp <4 x i64> %ld to <4 x double>
- ret <4 x double> %cvt
-}
-
-; Test case: load <4 x i32> from %a and uitofp it to <4 x double>.
-; The AVX512 prefixes expect vcvtudq2pd. The SSE2, SSE41, AVX1 and AVX2 checks
-; expect each element to be split into its low 16 bits (mask or blend with
-; zero) and high 16 bits (psrld $16); both parts go through the signed
-; cvtdq2pd, the high part is multiplied by a constant (shown as 6.5536E+4
-; where the checks print it) and the two results are added.
-define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) {
-; SSE2-LABEL: uitofp_load_4i32_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE2-NEXT: mulpd %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [65535,0,65535,0,0,0,0,0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE2-NEXT: addpd %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE2-NEXT: mulpd %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE2-NEXT: addpd %xmm5, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i32_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [6.5536E+4,6.5536E+4]
-; SSE41-NEXT: mulpd %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE41-NEXT: addpd %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm5
-; SSE41-NEXT: mulpd %xmm2, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1],xmm4[2],xmm3[3],xmm4[4,5,6,7]
-; SSE41-NEXT: cvtdq2pd %xmm4, %xmm1
-; SSE41-NEXT: addpd %xmm5, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_load_4i32_to_4f64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX1-NEXT: vcvtdq2pd %xmm1, %ymm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX1-NEXT: vmulpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_load_4i32_to_4f64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vcvtdq2pd %xmm1, %ymm1
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [6.5536E+4,6.5536E+4,6.5536E+4,6.5536E+4]
-; AVX2-NEXT: vmulpd %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vxorpd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX2-NEXT: vaddpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps (%rdi), %xmm0
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2pd (%rdi), %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2pd (%rdi), %ymm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <4 x i32>, <4 x i32> *%a
- %cvt = uitofp <4 x i32> %ld to <4 x double>
- ret <4 x double> %cvt
-}
-
-; Test case: load <4 x i16> from %a and uitofp it to <4 x double>.
-; The checks expect the words to be zero-extended to dwords (movq + punpcklwd
-; against zero for SSE2, pmovzxwd/vpmovzxwd from memory otherwise) and then
-; converted with cvtdq2pd. The SSE2 and SSE41 checks convert two elements at
-; a time into xmm0 and xmm1 (pshufd brings the upper elements down); the AVX
-; checks use a single vcvtdq2pd into ymm0.
-define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) {
-; SSE2-LABEL: uitofp_load_4i16_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i16_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_load_4i16_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %ld = load <4 x i16>, <4 x i16> *%a
- %cvt = uitofp <4 x i16> %ld to <4 x double>
- ret <4 x double> %cvt
-}
-
-; Test case: load <4 x i8> from %a and uitofp it to <4 x double>.
-; The checks expect the bytes to be zero-extended to dwords (movd + punpcklbw
-; + punpcklwd against zero for SSE2, pmovzxbd/vpmovzxbd from memory otherwise)
-; and then converted with cvtdq2pd. The SSE2 and SSE41 checks convert two
-; elements at a time into xmm0 and xmm1; the AVX checks use a single
-; vcvtdq2pd into ymm0.
-define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) {
-; SSE2-LABEL: uitofp_load_4i8_to_4f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i8_to_4f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_load_4i8_to_4f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vcvtdq2pd %xmm0, %ymm0
-; AVX-NEXT: retq
- %ld = load <4 x i8>, <4 x i8> *%a
- %cvt = uitofp <4 x i8> %ld to <4 x double>
- ret <4 x double> %cvt
-}
-
-;
-; Load Signed Integer to Float
-;
-
-; Test case: load <4 x i64> from %a and sitofp it to <4 x float>.
-; The AVX512DQ checks expect vcvtqq2ps on a zmm register and the AVX512VLDQ
-; checks expect vcvtqq2psy with a memory operand. All other prefixes expect
-; each element to be moved to %rax (movq / pextrq / pshufd+movq), converted
-; with the scalar cvtsi2ss, and the four results to be recombined with
-; unpcklps + movlhps (SSE2) or insertps/vinsertps.
-define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
-; SSE2-LABEL: sitofp_load_4i64_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_4i64_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_load_4i64_to_4f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm0
-; VEX-NEXT: vmovdqa 16(%rdi), %xmm1
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; VEX-NEXT: vmovq %xmm1, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; VEX-NEXT: vpextrq $1, %xmm1, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT: vcvtqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2psy (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <4 x i64>, <4 x i64> *%a
- %cvt = sitofp <4 x i64> %ld to <4 x float>
- ret <4 x float> %cvt
-}
-
-; Test case: load <4 x i32> from %a and sitofp it to <4 x float>.
-; Both check prefixes expect a single cvtdq2ps/vcvtdq2ps that takes (%rdi)
-; directly as its memory operand.
-define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) {
-; SSE-LABEL: sitofp_load_4i32_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_4i32_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2ps (%rdi), %xmm0
-; AVX-NEXT: retq
- %ld = load <4 x i32>, <4 x i32> *%a
- %cvt = sitofp <4 x i32> %ld to <4 x float>
- ret <4 x float> %cvt
-}
-
-; Test case: load <4 x i16> from %a and sitofp it to <4 x float>.
-; The checks expect the words to be sign-extended to dwords (movq + punpcklwd
-; + psrad $16 for SSE2, pmovsxwd/vpmovsxwd from memory otherwise) followed by
-; cvtdq2ps.
-define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) {
-; SSE2-LABEL: sitofp_load_4i16_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_4i16_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_4i16_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load <4 x i16>, <4 x i16> *%a
- %cvt = sitofp <4 x i16> %ld to <4 x float>
- ret <4 x float> %cvt
-}
-
-; Test case: load <4 x i8> from %a and sitofp it to <4 x float>.
-; The checks expect the bytes to be sign-extended to dwords (movd + punpcklbw
-; + punpcklwd + psrad $24 for SSE2, pmovsxbd/vpmovsxbd from memory otherwise)
-; followed by cvtdq2ps.
-define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) {
-; SSE2-LABEL: sitofp_load_4i8_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_4i8_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_4i8_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load <4 x i8>, <4 x i8> *%a
- %cvt = sitofp <4 x i8> %ld to <4 x float>
- ret <4 x float> %cvt
-}
-
-; Test case: load <8 x i64> from %a and sitofp it to <8 x float>.
-; The AVX512DQ and AVX512VLDQ checks expect a single vcvtqq2ps from (%rdi)
-; into ymm0. All other prefixes expect four 16-byte loads, eight scalar
-; cvtsi2ss conversions through %rax, and recombination with unpcklps +
-; movlhps (SSE2) or insertps/vinsertps; the VEX, AVX512F and AVX512VL checks
-; additionally join the two xmm halves with vinsertf128.
-define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) {
-; SSE2-LABEL: sitofp_load_8i64_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: movdqa 32(%rdi), %xmm2
-; SSE2-NEXT: movdqa 48(%rdi), %xmm3
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: xorps %xmm4, %xmm4
-; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_8i64_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: movdqa 32(%rdi), %xmm2
-; SSE41-NEXT: movdqa 48(%rdi), %xmm3
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm4
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: xorps %xmm4, %xmm4
-; SSE41-NEXT: cvtsi2ss %rax, %xmm4
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT: pextrq $1, %xmm2, %rax
-; SSE41-NEXT: xorps %xmm4, %xmm4
-; SSE41-NEXT: cvtsi2ss %rax, %xmm4
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3]
-; SSE41-NEXT: movq %xmm3, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; SSE41-NEXT: pextrq $1, %xmm3, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: sitofp_load_8i64_to_8f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm0
-; VEX-NEXT: vmovdqa 16(%rdi), %xmm1
-; VEX-NEXT: vmovdqa 32(%rdi), %xmm2
-; VEX-NEXT: vmovdqa 48(%rdi), %xmm3
-; VEX-NEXT: vpextrq $1, %xmm2, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4
-; VEX-NEXT: vmovq %xmm2, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2
-; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; VEX-NEXT: vmovq %xmm3, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; VEX-NEXT: vpextrq $1, %xmm3, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; VEX-NEXT: vmovq %xmm1, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; VEX-NEXT: vpextrq $1, %xmm1, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; VEX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4
-; AVX512F-NEXT: vmovq %xmm2, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX512F-NEXT: vmovq %xmm3, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512F-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovq %xmm2, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX512VL-NEXT: vmovq %xmm3, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvtqq2ps (%rdi), %ymm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: sitofp_load_8i64_to_8f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtqq2ps (%rdi), %ymm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <8 x i64>, <8 x i64> *%a
- %cvt = sitofp <8 x i64> %ld to <8 x float>
- ret <8 x float> %cvt
-}
-
-; Test case: load <8 x i32> from %a and sitofp it to <8 x float>.
-; The SSE checks expect two memory-operand cvtdq2ps instructions, at (%rdi)
-; and 16(%rdi), writing xmm0 and xmm1; the AVX checks expect one vcvtdq2ps
-; from (%rdi) into ymm0.
-define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) {
-; SSE-LABEL: sitofp_load_8i32_to_8f32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2ps (%rdi), %xmm0
-; SSE-NEXT: cvtdq2ps 16(%rdi), %xmm1
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_load_8i32_to_8f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2ps (%rdi), %ymm0
-; AVX-NEXT: retq
- %ld = load <8 x i32>, <8 x i32> *%a
- %cvt = sitofp <8 x i32> %ld to <8 x float>
- ret <8 x float> %cvt
-}
-
-; Test case: load <8 x i16> from %a and sitofp it to <8 x float>.
-; The checks expect the words to be sign-extended to dwords and then converted
-; with cvtdq2ps. SSE2 uses punpcklwd/punpckhwd + psrad $16 on each half;
-; SSE41 and AVX1 use two pmovsxwd loads at (%rdi) and 8(%rdi) (AVX1 joins them
-; with vinsertf128); AVX2 and AVX512 use one vpmovsxwd from (%rdi) into ymm0.
-define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) {
-; SSE2-LABEL: sitofp_load_8i16_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_8i16_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
-; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_load_8i16_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_load_8i16_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sitofp_load_8i16_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: retq
- %ld = load <8 x i16>, <8 x i16> *%a
- %cvt = sitofp <8 x i16> %ld to <8 x float>
- ret <8 x float> %cvt
-}
-
-; Test case: load <8 x i8> from %a and sitofp it to <8 x float>.
-; The checks expect the bytes to be sign-extended to dwords and then converted
-; with cvtdq2ps. SSE2 uses movq + punpcklbw, then punpcklwd/punpckhwd +
-; psrad $24 on each half; SSE41 and AVX1 use two pmovsxbd loads at (%rdi) and
-; 4(%rdi) (AVX1 joins them with vinsertf128); AVX2 and AVX512 use one
-; vpmovsxbd from (%rdi) into ymm0.
-define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) {
-; SSE2-LABEL: sitofp_load_8i8_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: sitofp_load_8i8_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1
-; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sitofp_load_8i8_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sitofp_load_8i8_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sitofp_load_8i8_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: retq
- %ld = load <8 x i8>, <8 x i8> *%a
- %cvt = sitofp <8 x i8> %ld to <8 x float>
- ret <8 x float> %cvt
-}
-
-;
-; Load Unsigned Integer to Float
-;
-
-define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
-; SSE2-LABEL: uitofp_load_4i64_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm2
-; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: jmp .LBB81_3
-; SSE2-NEXT: .LBB81_1:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: addss %xmm1, %xmm1
-; SSE2-NEXT: .LBB81_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_4
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: jmp .LBB81_6
-; SSE2-NEXT: .LBB81_4:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: addss %xmm3, %xmm3
-; SSE2-NEXT: .LBB81_6:
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_7
-; SSE2-NEXT: # %bb.8:
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: jmp .LBB81_9
-; SSE2-NEXT: .LBB81_7:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: addss %xmm0, %xmm0
-; SSE2-NEXT: .LBB81_9:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB81_10
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: jmp .LBB81_12
-; SSE2-NEXT: .LBB81_10:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: addss %xmm2, %xmm2
-; SSE2-NEXT: .LBB81_12:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i64_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: jmp .LBB81_3
-; SSE41-NEXT: .LBB81_1:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: addss %xmm2, %xmm2
-; SSE41-NEXT: .LBB81_3:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: jmp .LBB81_6
-; SSE41-NEXT: .LBB81_4:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: addss %xmm0, %xmm0
-; SSE41-NEXT: .LBB81_6:
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_7
-; SSE41-NEXT: # %bb.8:
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: jmp .LBB81_9
-; SSE41-NEXT: .LBB81_7:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: addss %xmm2, %xmm2
-; SSE41-NEXT: .LBB81_9:
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB81_10
-; SSE41-NEXT: # %bb.11:
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB81_10:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: addss %xmm1, %xmm1
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_load_4i64_to_4f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm2
-; VEX-NEXT: vmovdqa 16(%rdi), %xmm0
-; VEX-NEXT: vpextrq $1, %xmm2, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_1
-; VEX-NEXT: # %bb.2:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: jmp .LBB81_3
-; VEX-NEXT: .LBB81_1:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1
-; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT: .LBB81_3:
-; VEX-NEXT: vmovq %xmm2, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_4
-; VEX-NEXT: # %bb.5:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT: jmp .LBB81_6
-; VEX-NEXT: .LBB81_4:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT: .LBB81_6:
-; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_7
-; VEX-NEXT: # %bb.8:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT: jmp .LBB81_9
-; VEX-NEXT: .LBB81_7:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2
-; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT: .LBB81_9:
-; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB81_10
-; VEX-NEXT: # %bb.11:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VEX-NEXT: retq
-; VEX-NEXT: .LBB81_10:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0
-; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT: vcvtuqq2ps %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2psy (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <4 x i64>, <4 x i64> *%a
- %cvt = uitofp <4 x i64> %ld to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
-; SSE2-LABEL: uitofp_load_4i32_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: por {{.*}}(%rip), %xmm1
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: por {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addps {{.*}}(%rip), %xmm0
-; SSE2-NEXT: addps %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i32_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; SSE41-NEXT: addps {{.*}}(%rip), %xmm0
-; SSE41-NEXT: addps %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_load_4i32_to_4f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
-; AVX1-NEXT: vaddps {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_load_4i32_to_4f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1392508928,1392508928,1392508928,1392508928]
-; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
-; AVX2-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; AVX2-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps (%rdi), %xmm0
-; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2ps (%rdi), %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0
-; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %xmm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <4 x i32>, <4 x i32> *%a
- %cvt = uitofp <4 x i32> %ld to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) {
-; SSE2-LABEL: uitofp_load_4i16_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i16_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_load_4i16_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load <4 x i16>, <4 x i16> *%a
- %cvt = uitofp <4 x i16> %ld to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) {
-; SSE2-LABEL: uitofp_load_4i8_to_4f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_4i8_to_4f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: uitofp_load_4i8_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %ld = load <4 x i8>, <4 x i8> *%a
- %cvt = uitofp <4 x i8> %ld to <4 x float>
- ret <4 x float> %cvt
-}
-
-define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) {
-; SSE2-LABEL: uitofp_load_8i64_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm5
-; SSE2-NEXT: movdqa 16(%rdi), %xmm0
-; SSE2-NEXT: movdqa 32(%rdi), %xmm2
-; SSE2-NEXT: movdqa 48(%rdi), %xmm1
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB85_1
-; SSE2-NEXT: # %bb.2:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: jmp .LBB85_3
-; SSE2-NEXT: .LBB85_1:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm3
-; SSE2-NEXT: addss %xmm3, %xmm3
-; SSE2-NEXT: .LBB85_3:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB85_4
-; SSE2-NEXT: # %bb.5:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: jmp .LBB85_6
-; SSE2-NEXT: .LBB85_4:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm4
-; SSE2-NEXT: addss %xmm4, %xmm4
-; SSE2-NEXT: .LBB85_6:
-; SSE2-NEXT: movq %xmm5, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB85_7
-; SSE2-NEXT: # %bb.8:
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: jmp .LBB85_9
-; SSE2-NEXT: .LBB85_7:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: addss %xmm0, %xmm0
-; SSE2-NEXT: .LBB85_9:
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
-; SSE2-NEXT: movq %xmm5, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB85_10
-; SSE2-NEXT: # %bb.11:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm6
-; SSE2-NEXT: jmp .LBB85_12
-; SSE2-NEXT: .LBB85_10:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm6
-; SSE2-NEXT: addss %xmm6, %xmm6
-; SSE2-NEXT: .LBB85_12:
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB85_13
-; SSE2-NEXT: # %bb.14:
-; SSE2-NEXT: xorps %xmm5, %xmm5
-; SSE2-NEXT: cvtsi2ss %rax, %xmm5
-; SSE2-NEXT: jmp .LBB85_15
-; SSE2-NEXT: .LBB85_13:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm5, %xmm5
-; SSE2-NEXT: cvtsi2ss %rax, %xmm5
-; SSE2-NEXT: addss %xmm5, %xmm5
-; SSE2-NEXT: .LBB85_15:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: movq %xmm1, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB85_16
-; SSE2-NEXT: # %bb.17:
-; SSE2-NEXT: cvtsi2ss %rax, %xmm7
-; SSE2-NEXT: jmp .LBB85_18
-; SSE2-NEXT: .LBB85_16:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: cvtsi2ss %rax, %xmm7
-; SSE2-NEXT: addss %xmm7, %xmm7
-; SSE2-NEXT: .LBB85_18:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB85_19
-; SSE2-NEXT: # %bb.20:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: jmp .LBB85_21
-; SSE2-NEXT: .LBB85_19:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: cvtsi2ss %rax, %xmm1
-; SSE2-NEXT: addss %xmm1, %xmm1
-; SSE2-NEXT: .LBB85_21:
-; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: testq %rax, %rax
-; SSE2-NEXT: js .LBB85_22
-; SSE2-NEXT: # %bb.23:
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: jmp .LBB85_24
-; SSE2-NEXT: .LBB85_22:
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq %rcx
-; SSE2-NEXT: andl $1, %eax
-; SSE2-NEXT: orq %rcx, %rax
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: cvtsi2ss %rax, %xmm2
-; SSE2-NEXT: addss %xmm2, %xmm2
-; SSE2-NEXT: .LBB85_24:
-; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_8i64_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa 16(%rdi), %xmm4
-; SSE41-NEXT: movdqa 32(%rdi), %xmm1
-; SSE41-NEXT: movdqa 48(%rdi), %xmm2
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB85_1
-; SSE41-NEXT: # %bb.2:
-; SSE41-NEXT: cvtsi2ss %rax, %xmm3
-; SSE41-NEXT: jmp .LBB85_3
-; SSE41-NEXT: .LBB85_1:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm3
-; SSE41-NEXT: addss %xmm3, %xmm3
-; SSE41-NEXT: .LBB85_3:
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB85_4
-; SSE41-NEXT: # %bb.5:
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: jmp .LBB85_6
-; SSE41-NEXT: .LBB85_4:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: addss %xmm0, %xmm0
-; SSE41-NEXT: .LBB85_6:
-; SSE41-NEXT: movq %xmm4, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB85_7
-; SSE41-NEXT: # %bb.8:
-; SSE41-NEXT: cvtsi2ss %rax, %xmm5
-; SSE41-NEXT: jmp .LBB85_9
-; SSE41-NEXT: .LBB85_7:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm5
-; SSE41-NEXT: addss %xmm5, %xmm5
-; SSE41-NEXT: .LBB85_9:
-; SSE41-NEXT: pextrq $1, %xmm4, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB85_10
-; SSE41-NEXT: # %bb.11:
-; SSE41-NEXT: xorps %xmm4, %xmm4
-; SSE41-NEXT: cvtsi2ss %rax, %xmm4
-; SSE41-NEXT: jmp .LBB85_12
-; SSE41-NEXT: .LBB85_10:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm4, %xmm4
-; SSE41-NEXT: cvtsi2ss %rax, %xmm4
-; SSE41-NEXT: addss %xmm4, %xmm4
-; SSE41-NEXT: .LBB85_12:
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB85_13
-; SSE41-NEXT: # %bb.14:
-; SSE41-NEXT: cvtsi2ss %rax, %xmm6
-; SSE41-NEXT: jmp .LBB85_15
-; SSE41-NEXT: .LBB85_13:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: cvtsi2ss %rax, %xmm6
-; SSE41-NEXT: addss %xmm6, %xmm6
-; SSE41-NEXT: .LBB85_15:
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB85_16
-; SSE41-NEXT: # %bb.17:
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: jmp .LBB85_18
-; SSE41-NEXT: .LBB85_16:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm1, %xmm1
-; SSE41-NEXT: cvtsi2ss %rax, %xmm1
-; SSE41-NEXT: addss %xmm1, %xmm1
-; SSE41-NEXT: .LBB85_18:
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3]
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3]
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB85_19
-; SSE41-NEXT: # %bb.20:
-; SSE41-NEXT: xorps %xmm3, %xmm3
-; SSE41-NEXT: cvtsi2ss %rax, %xmm3
-; SSE41-NEXT: jmp .LBB85_21
-; SSE41-NEXT: .LBB85_19:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm3, %xmm3
-; SSE41-NEXT: cvtsi2ss %rax, %xmm3
-; SSE41-NEXT: addss %xmm3, %xmm3
-; SSE41-NEXT: .LBB85_21:
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0]
-; SSE41-NEXT: pextrq $1, %xmm2, %rax
-; SSE41-NEXT: testq %rax, %rax
-; SSE41-NEXT: js .LBB85_22
-; SSE41-NEXT: # %bb.23:
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; SSE41-NEXT: retq
-; SSE41-NEXT: .LBB85_22:
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shrq %rcx
-; SSE41-NEXT: andl $1, %eax
-; SSE41-NEXT: orq %rcx, %rax
-; SSE41-NEXT: xorps %xmm2, %xmm2
-; SSE41-NEXT: cvtsi2ss %rax, %xmm2
-; SSE41-NEXT: addss %xmm2, %xmm2
-; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: uitofp_load_8i64_to_8f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovdqa (%rdi), %xmm1
-; VEX-NEXT: vmovdqa 16(%rdi), %xmm0
-; VEX-NEXT: vmovdqa 32(%rdi), %xmm4
-; VEX-NEXT: vmovdqa 48(%rdi), %xmm3
-; VEX-NEXT: vpextrq $1, %xmm4, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB85_1
-; VEX-NEXT: # %bb.2:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; VEX-NEXT: jmp .LBB85_3
-; VEX-NEXT: .LBB85_1:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2
-; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT: .LBB85_3:
-; VEX-NEXT: vmovq %xmm4, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB85_4
-; VEX-NEXT: # %bb.5:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5
-; VEX-NEXT: jmp .LBB85_6
-; VEX-NEXT: .LBB85_4:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4
-; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm5
-; VEX-NEXT: .LBB85_6:
-; VEX-NEXT: vmovq %xmm3, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB85_7
-; VEX-NEXT: # %bb.8:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4
-; VEX-NEXT: jmp .LBB85_9
-; VEX-NEXT: .LBB85_7:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4
-; VEX-NEXT: vaddss %xmm4, %xmm4, %xmm4
-; VEX-NEXT: .LBB85_9:
-; VEX-NEXT: vpextrq $1, %xmm3, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB85_10
-; VEX-NEXT: # %bb.11:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3
-; VEX-NEXT: jmp .LBB85_12
-; VEX-NEXT: .LBB85_10:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3
-; VEX-NEXT: vaddss %xmm3, %xmm3, %xmm3
-; VEX-NEXT: .LBB85_12:
-; VEX-NEXT: vpextrq $1, %xmm1, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB85_13
-; VEX-NEXT: # %bb.14:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6
-; VEX-NEXT: jmp .LBB85_15
-; VEX-NEXT: .LBB85_13:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm6, %xmm6
-; VEX-NEXT: vaddss %xmm6, %xmm6, %xmm6
-; VEX-NEXT: .LBB85_15:
-; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[2,3]
-; VEX-NEXT: vmovq %xmm1, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB85_16
-; VEX-NEXT: # %bb.17:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1
-; VEX-NEXT: jmp .LBB85_18
-; VEX-NEXT: .LBB85_16:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm1
-; VEX-NEXT: vaddss %xmm1, %xmm1, %xmm1
-; VEX-NEXT: .LBB85_18:
-; VEX-NEXT: vinsertps {{.*#+}} xmm5 = xmm1[0],xmm6[0],xmm1[2,3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm4[0],xmm2[3]
-; VEX-NEXT: vmovq %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB85_19
-; VEX-NEXT: # %bb.20:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2
-; VEX-NEXT: jmp .LBB85_21
-; VEX-NEXT: .LBB85_19:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2
-; VEX-NEXT: vaddss %xmm2, %xmm2, %xmm2
-; VEX-NEXT: .LBB85_21:
-; VEX-NEXT: vinsertps {{.*#+}} xmm2 = xmm5[0,1],xmm2[0],xmm5[3]
-; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
-; VEX-NEXT: vpextrq $1, %xmm0, %rax
-; VEX-NEXT: testq %rax, %rax
-; VEX-NEXT: js .LBB85_22
-; VEX-NEXT: # %bb.23:
-; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0
-; VEX-NEXT: jmp .LBB85_24
-; VEX-NEXT: .LBB85_22:
-; VEX-NEXT: movq %rax, %rcx
-; VEX-NEXT: shrq %rcx
-; VEX-NEXT: andl $1, %eax
-; VEX-NEXT: orq %rcx, %rax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm7, %xmm0
-; VEX-NEXT: vaddss %xmm0, %xmm0, %xmm0
-; VEX-NEXT: .LBB85_24:
-; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0]
-; VEX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4
-; AVX512F-NEXT: vmovq %xmm2, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX512F-NEXT: vmovq %xmm3, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512F-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT: vmovq %xmm0, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; AVX512F-NEXT: vmovq %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512F-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1
-; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512F-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovq %xmm2, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3]
-; AVX512VL-NEXT: vmovq %xmm3, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3]
-; AVX512VL-NEXT: vpextrq $1, %xmm3, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3]
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0],xmm0[3]
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax
-; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1
-; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX512VL-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vcvtuqq2ps (%rdi), %ymm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_8i64_to_8f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtuqq2ps (%rdi), %ymm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <8 x i64>, <8 x i64> *%a
- %cvt = uitofp <8 x i64> %ld to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) {
-; SSE2-LABEL: uitofp_load_8i32_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: movdqa 16(%rdi), %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928]
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; SSE2-NEXT: addps %xmm6, %xmm0
-; SSE2-NEXT: addps %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: addps %xmm6, %xmm1
-; SSE2-NEXT: addps %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_8i32_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa (%rdi), %xmm0
-; SSE41-NEXT: movdqa 16(%rdi), %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; SSE41-NEXT: movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; SSE41-NEXT: addps %xmm5, %xmm0
-; SSE41-NEXT: addps %xmm3, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; SSE41-NEXT: addps %xmm5, %xmm1
-; SSE41-NEXT: addps %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_load_8i32_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps (%rdi), %ymm0
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX1-NEXT: vmulps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_load_8i32_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa (%rdi), %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200,1258291200]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928,1392508928]
-; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
-; AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11]
-; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vaddps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovaps (%rdi), %ymm0
-; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2ps (%rdi), %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovaps (%rdi), %ymm0
-; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: uitofp_load_8i32_to_8f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2ps (%rdi), %ymm0
-; AVX512VLDQ-NEXT: retq
- %ld = load <8 x i32>, <8 x i32> *%a
- %cvt = uitofp <8 x i32> %ld to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) {
-; SSE2-LABEL: uitofp_load_8i16_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_8i16_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_load_8i16_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_load_8i16_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: uitofp_load_8i16_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: retq
- %ld = load <8 x i16>, <8 x i16> *%a
- %cvt = uitofp <8 x i16> %ld to <8 x float>
- ret <8 x float> %cvt
-}
-
-define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) {
-; SSE2-LABEL: uitofp_load_8i8_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: uitofp_load_8i8_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: uitofp_load_8i8_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: uitofp_load_8i8_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: uitofp_load_8i8_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: retq
- %ld = load <8 x i8>, <8 x i8> *%a
- %cvt = uitofp <8 x i8> %ld to <8 x float>
- ret <8 x float> %cvt
-}
-
-;
-; Aggregates
-;
-
-%Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }>
-define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
-; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq 24(%rdi), %rax
-; SSE2-NEXT: movdqu 8(%rdi), %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE2-NEXT: movaps %xmm0, 16(%rax)
-; SSE2-NEXT: movaps %xmm1, (%rax)
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movq 24(%rdi), %rax
-; SSE41-NEXT: pmovsxwd 16(%rdi), %xmm0
-; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
-; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1
-; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE41-NEXT: movaps %xmm0, 16(%rax)
-; SSE41-NEXT: movaps %xmm1, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: movq 24(%rdi), %rax
-; AVX1-NEXT: vpmovsxwd 16(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT: vmovaps %ymm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movq 24(%rdi), %rax
-; AVX2-NEXT: vpmovsxwd 8(%rdi), %ymm0
-; AVX2-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT: vmovaps %ymm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: aggregate_sitofp_8i16_to_8f32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movq 24(%rdi), %rax
-; AVX512-NEXT: vpmovsxwd 8(%rdi), %ymm0
-; AVX512-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX512-NEXT: vmovaps %ymm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = load %Arguments, %Arguments* %a0, align 1
- %2 = extractvalue %Arguments %1, 1
- %3 = extractvalue %Arguments %1, 2
- %4 = sitofp <8 x i16> %2 to <8 x float>
- store <8 x float> %4, <8 x float>* %3, align 32
- ret void
-}
-
-define <2 x double> @sitofp_i32_to_2f64(<2 x double> %a0, i32 %a1) nounwind {
-; SSE-LABEL: sitofp_i32_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtsi2sd %edi, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_i32_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = sitofp i32 %a1 to double
- %res = insertelement <2 x double> %a0, double %cvt, i32 0
- ret <2 x double> %res
-}
-
-define <4 x float> @sitofp_i32_to_4f32(<4 x float> %a0, i32 %a1) nounwind {
-; SSE-LABEL: sitofp_i32_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtsi2ss %edi, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_i32_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = sitofp i32 %a1 to float
- %res = insertelement <4 x float> %a0, float %cvt, i32 0
- ret <4 x float> %res
-}
-
-define <2 x double> @sitofp_i64_to_2f64(<2 x double> %a0, i64 %a1) nounwind {
-; SSE-LABEL: sitofp_i64_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtsi2sd %rdi, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_i64_to_2f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = sitofp i64 %a1 to double
- %res = insertelement <2 x double> %a0, double %cvt, i32 0
- ret <2 x double> %res
-}
-
-define <4 x float> @sitofp_i64_to_4f32(<4 x float> %a0, i64 %a1) nounwind {
-; SSE-LABEL: sitofp_i64_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtsi2ss %rdi, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: sitofp_i64_to_4f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0
-; AVX-NEXT: retq
- %cvt = sitofp i64 %a1 to float
- %res = insertelement <4 x float> %a0, float %cvt, i32 0
- ret <4 x float> %res
-}
-
-; Extract from int vector and convert to FP.
-
-define float @extract0_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_sitofp_v4i32_f32:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extract0_sitofp_v4i32_f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 0
- %r = sitofp i32 %e to float
- ret float %r
-}
-
-define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
-; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: incl %eax
-; SSE-NEXT: cvtsi2ss %eax, %xmm1
-; SSE-NEXT: divss %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extract0_sitofp_v4i32_f32i_multiuse1:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: incl %eax
-; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1
-; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 0
- %f = sitofp i32 %e to float
- %e1 = add i32 %e, 1
- %f1 = sitofp i32 %e1 to float
- %r = fdiv float %f, %f1
- ret float %r
-}
-
-define float @extract0_sitofp_v4i32_f32_multiuse2(<4 x i32> %x, i32* %p) nounwind {
-; SSE-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
-; SSE: # %bb.0:
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE-NEXT: movss %xmm0, (%rdi)
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extract0_sitofp_v4i32_f32_multiuse2:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm1
-; AVX-NEXT: vmovss %xmm0, (%rdi)
-; AVX-NEXT: vmovaps %xmm1, %xmm0
-; AVX-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 0
- %r = sitofp i32 %e to float
- store i32 %e, i32* %p
- ret float %r
-}
-
-define double @extract0_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_sitofp_v4i32_f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2sd %eax, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extract0_sitofp_v4i32_f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 0
- %r = sitofp i32 %e to double
- ret double %r
-}
-
-define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_uitofp_v4i32_f32:
-; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ss %rax, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: extract0_uitofp_v4i32_f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovd %xmm0, %eax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 0
- %r = uitofp i32 %e to float
- ret float %r
-}
-
-define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract0_uitofp_v4i32_f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2sd %rax, %xmm0
-; SSE-NEXT: retq
-;
-; VEX-LABEL: extract0_uitofp_v4i32_f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vmovd %xmm0, %eax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: extract0_uitofp_v4i32_f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 0
- %r = uitofp i32 %e to double
- ret double %r
-}
-
-; Extract non-zero element from int vector and convert to FP.
-
-define float @extract3_sitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE-LABEL: extract3_sitofp_v4i32_f32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: extract3_sitofp_v4i32_f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0
-; AVX-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 3
- %r = sitofp i32 %e to float
- ret float %r
-}
-
-define double @extract3_sitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_sitofp_v4i32_f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sd %eax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract3_sitofp_v4i32_f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $3, %xmm0, %eax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2sd %eax, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: extract3_sitofp_v4i32_f64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX-NEXT: vcvtdq2pd %xmm0, %xmm0
-; AVX-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 3
- %r = sitofp i32 %e to double
- ret double %r
-}
-
-define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_uitofp_v4i32_f32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2ss %rax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract3_uitofp_v4i32_f32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $3, %xmm0, %eax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2ss %rax, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: extract3_uitofp_v4i32_f32:
-; VEX: # %bb.0:
-; VEX-NEXT: vextractps $3, %xmm0, %eax
-; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtudq2ps %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT: vcvtudq2ps %zmm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f32:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VLDQ-NEXT: vcvtudq2ps %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 3
- %r = uitofp i32 %e to float
- ret float %r
-}
-
-define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind {
-; SSE2-LABEL: extract3_uitofp_v4i32_f64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: cvtsi2sd %rax, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: extract3_uitofp_v4i32_f64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: extractps $3, %xmm0, %eax
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: cvtsi2sd %rax, %xmm0
-; SSE41-NEXT: retq
-;
-; VEX-LABEL: extract3_uitofp_v4i32_f64:
-; VEX: # %bb.0:
-; VEX-NEXT: vextractps $3, %xmm0, %eax
-; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0
-; VEX-NEXT: retq
-;
-; AVX512F-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512F-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VL-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512DQ-NEXT: vcvtudq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512VLDQ-LABEL: extract3_uitofp_v4i32_f64:
-; AVX512VLDQ: # %bb.0:
-; AVX512VLDQ-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512VLDQ-NEXT: vcvtudq2pd %xmm0, %xmm0
-; AVX512VLDQ-NEXT: retq
- %e = extractelement <4 x i32> %x, i32 3
- %r = uitofp i32 %e to double
- ret double %r
-}
-
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X64_WIDEN
-; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X86_WIDEN
define void @test_udiv7_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
; X64-LABEL: test_udiv7_v2i32:
; X86-NEXT: psrld $2, %xmm0
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
-;
-; X64_WIDEN-LABEL: test_udiv7_v2i32:
-; X64_WIDEN: # %bb.0:
-; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2
-; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64_WIDEN-NEXT: psubd %xmm2, %xmm0
-; X64_WIDEN-NEXT: psrld $1, %xmm0
-; X64_WIDEN-NEXT: paddd %xmm2, %xmm0
-; X64_WIDEN-NEXT: psrld $2, %xmm0
-; X64_WIDEN-NEXT: movq %xmm0, (%rsi)
-; X64_WIDEN-NEXT: retq
-;
-; X86_WIDEN-LABEL: test_udiv7_v2i32:
-; X86_WIDEN: # %bb.0:
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2
-; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2
-; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3
-; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3
-; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86_WIDEN-NEXT: psubd %xmm2, %xmm0
-; X86_WIDEN-NEXT: psrld $1, %xmm0
-; X86_WIDEN-NEXT: paddd %xmm2, %xmm0
-; X86_WIDEN-NEXT: psrld $2, %xmm0
-; X86_WIDEN-NEXT: movq %xmm0, (%eax)
-; X86_WIDEN-NEXT: retl
%a = load <2 x i32>, <2 x i32>* %x
%b = udiv <2 x i32> %a, <i32 7, i32 7>
store <2 x i32> %b, <2 x i32>* %y
; X86-NEXT: paddd %xmm0, %xmm1
; X86-NEXT: movq %xmm1, (%eax)
; X86-NEXT: retl
-;
-; X64_WIDEN-LABEL: test_urem7_v2i32:
-; X64_WIDEN: # %bb.0:
-; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2
-; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X64_WIDEN-NEXT: movdqa %xmm0, %xmm1
-; X64_WIDEN-NEXT: psubd %xmm2, %xmm1
-; X64_WIDEN-NEXT: psrld $1, %xmm1
-; X64_WIDEN-NEXT: paddd %xmm2, %xmm1
-; X64_WIDEN-NEXT: psrld $2, %xmm1
-; X64_WIDEN-NEXT: movdqa %xmm1, %xmm2
-; X64_WIDEN-NEXT: pslld $3, %xmm2
-; X64_WIDEN-NEXT: psubd %xmm2, %xmm1
-; X64_WIDEN-NEXT: paddd %xmm0, %xmm1
-; X64_WIDEN-NEXT: movq %xmm1, (%rsi)
-; X64_WIDEN-NEXT: retq
-;
-; X86_WIDEN-LABEL: test_urem7_v2i32:
-; X86_WIDEN: # %bb.0:
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2
-; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2
-; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3
-; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3
-; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm1
-; X86_WIDEN-NEXT: psubd %xmm2, %xmm1
-; X86_WIDEN-NEXT: psrld $1, %xmm1
-; X86_WIDEN-NEXT: paddd %xmm2, %xmm1
-; X86_WIDEN-NEXT: psrld $2, %xmm1
-; X86_WIDEN-NEXT: movdqa %xmm1, %xmm2
-; X86_WIDEN-NEXT: pslld $3, %xmm2
-; X86_WIDEN-NEXT: psubd %xmm2, %xmm1
-; X86_WIDEN-NEXT: paddd %xmm0, %xmm1
-; X86_WIDEN-NEXT: movq %xmm1, (%eax)
-; X86_WIDEN-NEXT: retl
%a = load <2 x i32>, <2 x i32>* %x
%b = urem <2 x i32> %a, <i32 7, i32 7>
store <2 x i32> %b, <2 x i32>* %y
; X86-NEXT: paddd %xmm0, %xmm2
; X86-NEXT: movq %xmm2, (%eax)
; X86-NEXT: retl
-;
-; X64_WIDEN-LABEL: test_sdiv7_v2i32:
-; X64_WIDEN: # %bb.0:
-; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2
-; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64_WIDEN-NEXT: pxor %xmm3, %xmm3
-; X64_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3
-; X64_WIDEN-NEXT: pand %xmm1, %xmm3
-; X64_WIDEN-NEXT: paddd %xmm0, %xmm3
-; X64_WIDEN-NEXT: psubd %xmm3, %xmm2
-; X64_WIDEN-NEXT: paddd %xmm0, %xmm2
-; X64_WIDEN-NEXT: movdqa %xmm2, %xmm0
-; X64_WIDEN-NEXT: psrld $31, %xmm0
-; X64_WIDEN-NEXT: psrad $2, %xmm2
-; X64_WIDEN-NEXT: paddd %xmm0, %xmm2
-; X64_WIDEN-NEXT: movq %xmm2, (%rsi)
-; X64_WIDEN-NEXT: retq
-;
-; X86_WIDEN-LABEL: test_sdiv7_v2i32:
-; X86_WIDEN: # %bb.0:
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2
-; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2
-; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3
-; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3
-; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86_WIDEN-NEXT: pxor %xmm3, %xmm3
-; X86_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3
-; X86_WIDEN-NEXT: pand %xmm1, %xmm3
-; X86_WIDEN-NEXT: paddd %xmm0, %xmm3
-; X86_WIDEN-NEXT: psubd %xmm3, %xmm2
-; X86_WIDEN-NEXT: paddd %xmm0, %xmm2
-; X86_WIDEN-NEXT: movdqa %xmm2, %xmm0
-; X86_WIDEN-NEXT: psrld $31, %xmm0
-; X86_WIDEN-NEXT: psrad $2, %xmm2
-; X86_WIDEN-NEXT: paddd %xmm0, %xmm2
-; X86_WIDEN-NEXT: movq %xmm2, (%eax)
-; X86_WIDEN-NEXT: retl
%a = load <2 x i32>, <2 x i32>* %x
%b = sdiv <2 x i32> %a, <i32 7, i32 7>
store <2 x i32> %b, <2 x i32>* %y
; X86-NEXT: paddd %xmm0, %xmm2
; X86-NEXT: movq %xmm2, (%eax)
; X86-NEXT: retl
-;
-; X64_WIDEN-LABEL: test_srem7_v2i32:
-; X64_WIDEN: # %bb.0:
-; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X64_WIDEN-NEXT: movdqa %xmm0, %xmm2
-; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm2
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; X64_WIDEN-NEXT: pmuludq %xmm1, %xmm3
-; X64_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X64_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X64_WIDEN-NEXT: pxor %xmm3, %xmm3
-; X64_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3
-; X64_WIDEN-NEXT: pand %xmm1, %xmm3
-; X64_WIDEN-NEXT: paddd %xmm0, %xmm3
-; X64_WIDEN-NEXT: psubd %xmm3, %xmm2
-; X64_WIDEN-NEXT: paddd %xmm0, %xmm2
-; X64_WIDEN-NEXT: movdqa %xmm2, %xmm1
-; X64_WIDEN-NEXT: psrld $31, %xmm1
-; X64_WIDEN-NEXT: psrad $2, %xmm2
-; X64_WIDEN-NEXT: paddd %xmm1, %xmm2
-; X64_WIDEN-NEXT: movdqa %xmm2, %xmm1
-; X64_WIDEN-NEXT: pslld $3, %xmm1
-; X64_WIDEN-NEXT: psubd %xmm1, %xmm2
-; X64_WIDEN-NEXT: paddd %xmm0, %xmm2
-; X64_WIDEN-NEXT: movq %xmm2, (%rsi)
-; X64_WIDEN-NEXT: retq
-;
-; X86_WIDEN-LABEL: test_srem7_v2i32:
-; X86_WIDEN: # %bb.0:
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT: movdqa {{.*#+}} xmm1 = [2454267027,2454267027,2454267027,2454267027]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm2
-; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm2
-; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X86_WIDEN-NEXT: movdqa %xmm0, %xmm3
-; X86_WIDEN-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
-; X86_WIDEN-NEXT: pmuludq %xmm1, %xmm3
-; X86_WIDEN-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X86_WIDEN-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; X86_WIDEN-NEXT: pxor %xmm3, %xmm3
-; X86_WIDEN-NEXT: pcmpgtd %xmm0, %xmm3
-; X86_WIDEN-NEXT: pand %xmm1, %xmm3
-; X86_WIDEN-NEXT: paddd %xmm0, %xmm3
-; X86_WIDEN-NEXT: psubd %xmm3, %xmm2
-; X86_WIDEN-NEXT: paddd %xmm0, %xmm2
-; X86_WIDEN-NEXT: movdqa %xmm2, %xmm1
-; X86_WIDEN-NEXT: psrld $31, %xmm1
-; X86_WIDEN-NEXT: psrad $2, %xmm2
-; X86_WIDEN-NEXT: paddd %xmm1, %xmm2
-; X86_WIDEN-NEXT: movdqa %xmm2, %xmm1
-; X86_WIDEN-NEXT: pslld $3, %xmm1
-; X86_WIDEN-NEXT: psubd %xmm1, %xmm2
-; X86_WIDEN-NEXT: paddd %xmm0, %xmm2
-; X86_WIDEN-NEXT: movq %xmm2, (%eax)
-; X86_WIDEN-NEXT: retl
%a = load <2 x i32>, <2 x i32>* %x
%b = srem <2 x i32> %a, <i32 7, i32 7>
store <2 x i32> %b, <2 x i32>* %y
; X86-NEXT: psrld $3, %xmm0
; X86-NEXT: movq %xmm0, (%eax)
; X86-NEXT: retl
-;
-; X64_WIDEN-LABEL: test_udiv_pow2_v2i32:
-; X64_WIDEN: # %bb.0:
-; X64_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X64_WIDEN-NEXT: psrld $3, %xmm0
-; X64_WIDEN-NEXT: movq %xmm0, (%rsi)
-; X64_WIDEN-NEXT: retq
-;
-; X86_WIDEN-LABEL: test_udiv_pow2_v2i32:
-; X86_WIDEN: # %bb.0:
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86_WIDEN-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86_WIDEN-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X86_WIDEN-NEXT: psrld $3, %xmm0
-; X86_WIDEN-NEXT: movq %xmm0, (%eax)
-; X86_WIDEN-NEXT: retl
%a = load <2 x i32>, <2 x i32>* %x
%b = udiv <2 x i32> %a, <i32 8, i32 8>
store <2 x i32> %b, <2 x i32>* %y
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -x86-experimental-vector-widening-legalization | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1 immarg)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE-LABEL: test_v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE-LABEL: test_v4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE-LABEL: test_v8i64:
-; SSE: # %bb.0:
-; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: paddq %xmm2, %xmm1
-; SSE-NEXT: paddq %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: paddq %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE-LABEL: test_v16i64:
-; SSE: # %bb.0:
-; SSE-NEXT: paddq %xmm6, %xmm2
-; SSE-NEXT: paddq %xmm7, %xmm3
-; SSE-NEXT: paddq %xmm5, %xmm3
-; SSE-NEXT: paddq %xmm1, %xmm3
-; SSE-NEXT: paddq %xmm4, %xmm2
-; SSE-NEXT: paddq %xmm3, %xmm2
-; SSE-NEXT: paddq %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: paddq %xmm2, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE-LABEL: test_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v2i32:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v2i32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE-LABEL: test_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v4i32:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v4i32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE-LABEL: test_v8i32:
-; SSE: # %bb.0:
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v8i32:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: vzeroupper
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v8i32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE-LABEL: test_v16i32:
-; SSE: # %bb.0:
-; SSE-NEXT: paddd %xmm3, %xmm1
-; SSE-NEXT: paddd %xmm2, %xmm1
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: paddd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v16i32:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: vzeroupper
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v16i32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE-LABEL: test_v32i32:
-; SSE: # %bb.0:
-; SSE-NEXT: paddd %xmm6, %xmm2
-; SSE-NEXT: paddd %xmm7, %xmm3
-; SSE-NEXT: paddd %xmm5, %xmm3
-; SSE-NEXT: paddd %xmm1, %xmm3
-; SSE-NEXT: paddd %xmm4, %xmm2
-; SSE-NEXT: paddd %xmm3, %xmm2
-; SSE-NEXT: paddd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: paddd %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v32i32:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm4
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: vzeroupper
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v32i32:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm4
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v2i16:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v2i16:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v4i16:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v4i16:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE-LABEL: test_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v8i16:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v8i16:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE-LABEL: test_v16i16:
-; SSE: # %bb.0:
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v16i16:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT: vzeroupper
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v16i16:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm1, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE-LABEL: test_v32i16:
-; SSE: # %bb.0:
-; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: paddw %xmm2, %xmm1
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v32i16:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-SLOW-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT: vzeroupper
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v32i16:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE-LABEL: test_v64i16:
-; SSE: # %bb.0:
-; SSE-NEXT: paddw %xmm6, %xmm2
-; SSE-NEXT: paddw %xmm7, %xmm3
-; SSE-NEXT: paddw %xmm5, %xmm3
-; SSE-NEXT: paddw %xmm1, %xmm3
-; SSE-NEXT: paddw %xmm4, %xmm2
-; SSE-NEXT: paddw %xmm3, %xmm2
-; SSE-NEXT: paddw %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: paddw %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: paddw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-SLOW-LABEL: test_v64i16:
-; AVX1-SLOW: # %bb.0:
-; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm4
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm2, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX1-SLOW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-SLOW-NEXT: vzeroupper
-; AVX1-SLOW-NEXT: retq
-;
-; AVX1-FAST-LABEL: test_v64i16:
-; AVX1-FAST: # %bb.0:
-; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm4
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm3, %xmm1, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm4, %xmm2, %xmm2
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm2, %xmm1
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-FAST-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vmovd %xmm0, %eax
-; AVX1-FAST-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-FAST-NEXT: vzeroupper
-; AVX1-FAST-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: paddb %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: paddb %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psadbw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: psadbw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddb %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: psadbw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddb %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: psadbw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddb %xmm3, %xmm1
-; SSE2-NEXT: paddb %xmm2, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: paddb %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddb %xmm3, %xmm1
-; SSE41-NEXT: paddb %xmm2, %xmm1
-; SSE41-NEXT: paddb %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: paddb %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: psadbw %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: paddb %xmm7, %xmm3
-; SSE2-NEXT: paddb %xmm5, %xmm3
-; SSE2-NEXT: paddb %xmm1, %xmm3
-; SSE2-NEXT: paddb %xmm6, %xmm2
-; SSE2-NEXT: paddb %xmm4, %xmm2
-; SSE2-NEXT: paddb %xmm3, %xmm2
-; SSE2-NEXT: paddb %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: paddb %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: paddb %xmm7, %xmm3
-; SSE41-NEXT: paddb %xmm5, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm3
-; SSE41-NEXT: paddb %xmm6, %xmm2
-; SSE41-NEXT: paddb %xmm4, %xmm2
-; SSE41-NEXT: paddb %xmm3, %xmm2
-; SSE41-NEXT: paddb %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: paddb %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: psadbw %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpaddb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddb %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE-LABEL: test_v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE-LABEL: test_v4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE-LABEL: test_v8i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE-LABEL: test_v16i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE-LABEL: test_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE-LABEL: test_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE-LABEL: test_v8i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE-LABEL: test_v16i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE-LABEL: test_v32i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE-LABEL: test_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE-LABEL: test_v16i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE-LABEL: test_v32i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE-LABEL: test_v64i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm5, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm3, %xmm2
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pand %xmm3, %xmm1
-; SSE41-NEXT: pand %xmm2, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pand %xmm6, %xmm2
-; SSE41-NEXT: pand %xmm7, %xmm3
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: pand %xmm1, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: pand %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pand %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE-LABEL: test_v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: pmuludq %xmm0, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v2i64:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovq %xmm0, %rax
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v2i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovq %xmm0, %rax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v2i64:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovq %xmm0, %rax
-; AVX512DQVL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE-LABEL: test_v4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm0, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: pmuludq %xmm0, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v4i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v4i64:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovq %xmm0, %rax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v4i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovq %xmm0, %rax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v4i64:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovq %xmm0, %rax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE-LABEL: test_v8i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm1, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm0, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: paddq %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm0, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: pmuludq %xmm0, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4
-; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3
-; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v8i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v8i64:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovq %xmm0, %rax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v8i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovq %xmm0, %rax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v8i64:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovq %xmm0, %rax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE-LABEL: test_v16i64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm2, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm6, %xmm8
-; SSE-NEXT: movdqa %xmm6, %xmm9
-; SSE-NEXT: psrlq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm2, %xmm9
-; SSE-NEXT: paddq %xmm8, %xmm9
-; SSE-NEXT: psllq $32, %xmm9
-; SSE-NEXT: pmuludq %xmm6, %xmm2
-; SSE-NEXT: paddq %xmm9, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm8
-; SSE-NEXT: psrlq $32, %xmm8
-; SSE-NEXT: pmuludq %xmm4, %xmm8
-; SSE-NEXT: movdqa %xmm4, %xmm6
-; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm0, %xmm6
-; SSE-NEXT: paddq %xmm8, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: paddq %xmm6, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm7, %xmm4
-; SSE-NEXT: movdqa %xmm7, %xmm6
-; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm3, %xmm6
-; SSE-NEXT: paddq %xmm4, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm7, %xmm3
-; SSE-NEXT: paddq %xmm6, %xmm3
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: movdqa %xmm5, %xmm6
-; SSE-NEXT: psrlq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm1, %xmm6
-; SSE-NEXT: paddq %xmm4, %xmm6
-; SSE-NEXT: psllq $32, %xmm6
-; SSE-NEXT: pmuludq %xmm5, %xmm1
-; SSE-NEXT: paddq %xmm6, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm3, %xmm4
-; SSE-NEXT: movdqa %xmm3, %xmm5
-; SSE-NEXT: psrlq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm1, %xmm5
-; SSE-NEXT: paddq %xmm4, %xmm5
-; SSE-NEXT: psllq $32, %xmm5
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: movdqa %xmm2, %xmm4
-; SSE-NEXT: psrlq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm0, %xmm4
-; SSE-NEXT: paddq %xmm3, %xmm4
-; SSE-NEXT: psllq $32, %xmm4
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: paddq %xmm4, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm1, %xmm3
-; SSE-NEXT: psrlq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm0, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: psrlq $32, %xmm2
-; SSE-NEXT: pmuludq %xmm1, %xmm2
-; SSE-NEXT: movdqa %xmm0, %xmm3
-; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE-NEXT: pmuludq %xmm0, %xmm3
-; SSE-NEXT: paddq %xmm2, %xmm3
-; SSE-NEXT: psllq $32, %xmm3
-; SSE-NEXT: pmuludq %xmm1, %xmm0
-; SSE-NEXT: paddq %xmm3, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4
-; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5
-; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5
-; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
-; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6
-; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6
-; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6
-; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7
-; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7
-; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
-; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6
-; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6
-; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1
-; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2
-; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2
-; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4
-; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5
-; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5
-; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3
-; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3
-; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2
-; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2
-; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX2-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v16i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v16i64:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3
-; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX512BWVL-NEXT: vpmuludq %xmm3, %xmm0, %xmm3
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX512BWVL-NEXT: vpsllq $32, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovq %xmm0, %rax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v16i64:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vmovq %xmm0, %rax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v16i64:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512DQVL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullq %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovq %xmm0, %rax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE2-LABEL: test_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE2-LABEL: test_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE2-LABEL: test_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm3
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,0,0]
-; SSE2-NEXT: pmuludq %xmm3, %xmm0
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE2-LABEL: test_v16i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm5, %xmm2
-; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,0,0]
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld %xmm3, %xmm1
-; SSE41-NEXT: pmulld %xmm2, %xmm1
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE2-LABEL: test_v32i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm6[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm2[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm4[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm10
-; SSE2-NEXT: pmuludq %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm3[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm8, %xmm11
-; SSE2-NEXT: pmuludq %xmm9, %xmm11
-; SSE2-NEXT: pmuludq %xmm10, %xmm11
-; SSE2-NEXT: pmuludq %xmm6, %xmm2
-; SSE2-NEXT: pmuludq %xmm4, %xmm0
-; SSE2-NEXT: pmuludq %xmm2, %xmm0
-; SSE2-NEXT: pmuludq %xmm7, %xmm3
-; SSE2-NEXT: pmuludq %xmm5, %xmm1
-; SSE2-NEXT: pmuludq %xmm3, %xmm1
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[2,2,0,0]
-; SSE2-NEXT: pmuludq %xmm11, %xmm1
-; SSE2-NEXT: pmuludq %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmulld %xmm6, %xmm2
-; SSE41-NEXT: pmulld %xmm7, %xmm3
-; SSE41-NEXT: pmulld %xmm5, %xmm3
-; SSE41-NEXT: pmulld %xmm1, %xmm3
-; SSE41-NEXT: pmulld %xmm4, %xmm2
-; SSE41-NEXT: pmulld %xmm3, %xmm2
-; SSE41-NEXT: pmulld %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pmulld %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmulld %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE-LABEL: test_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE-LABEL: test_v16i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE-LABEL: test_v32i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pmullw %xmm3, %xmm1
-; SSE-NEXT: pmullw %xmm2, %xmm1
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v32i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v32i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovd %xmm0, %eax
-; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v32i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovd %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v32i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovd %xmm0, %eax
-; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE-LABEL: test_v64i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pmullw %xmm6, %xmm2
-; SSE-NEXT: pmullw %xmm7, %xmm3
-; SSE-NEXT: pmullw %xmm5, %xmm3
-; SSE-NEXT: pmullw %xmm1, %xmm3
-; SSE-NEXT: pmullw %xmm4, %xmm2
-; SSE-NEXT: pmullw %xmm3, %xmm2
-; SSE-NEXT: pmullw %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pmullw %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pmullw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmullw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v64i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v64i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovd %xmm0, %eax
-; AVX512BWVL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v64i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovd %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v64i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovd %xmm0, %eax
-; AVX512DQVL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
-; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,2,3,0]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,xmm1[6],zero,xmm1[10],zero,xmm1[14],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero
-; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512BW-LABEL: test_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512VL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm0[4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3]
-; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX512VL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmullw %xmm0, %xmm1, %xmm0
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2],zero,xmm0[6],zero,xmm0[10],zero,xmm0[14],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQ-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmullw %xmm3, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmullw %xmm3, %xmm0
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmullw %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[4],zero,xmm0[6],zero,xmm0[4],zero,xmm0[6],zero,xmm0[8],zero,xmm0[10],zero,xmm0[12],zero,xmm0[14],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v16i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQVL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm2, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm3, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm2, %xmm3
-; SSE41-NEXT: pshufb {{.*#+}} xmm3 = xmm3[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: pmullw %xmm1, %xmm3
-; SSE41-NEXT: pmullw %xmm0, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pmullw %xmm1, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: packuswb %xmm0, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; SSE41-NEXT: pmullw %xmm1, %xmm3
-; SSE41-NEXT: pand %xmm2, %xmm3
-; SSE41-NEXT: packuswb %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmullw %xmm3, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v32i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v32i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v32i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v32i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm3, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm1, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: pmullw %xmm4, %xmm0
-; SSE2-NEXT: pmullw %xmm5, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: packuswb %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: pmullw %xmm3, %xmm1
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmullw %xmm3, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm3, %xmm6
-; SSE41-NEXT: pmullw %xmm4, %xmm5
-; SSE41-NEXT: pshufb %xmm3, %xmm5
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; SSE41-NEXT: pmullw %xmm4, %xmm5
-; SSE41-NEXT: pshufb %xmm3, %xmm5
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; SSE41-NEXT: pmullw %xmm2, %xmm3
-; SSE41-NEXT: pmullw %xmm0, %xmm3
-; SSE41-NEXT: pand %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
-; SSE41-NEXT: pmullw %xmm2, %xmm3
-; SSE41-NEXT: pand %xmm1, %xmm3
-; SSE41-NEXT: packuswb %xmm0, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
-; SSE41-NEXT: pmullw %xmm2, %xmm3
-; SSE41-NEXT: pand %xmm1, %xmm3
-; SSE41-NEXT: packuswb %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmullw %xmm3, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpmullw %xmm3, %xmm5, %xmm3
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmullw %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v64i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpmullw %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v64i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BWVL-NEXT: vpmullw %zmm3, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v64i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v64i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %ymm3, %ymm2, %ymm2
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpand %ymm3, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm6, %xmm8
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm4, %xmm10
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm9, %xmm10
-; SSE2-NEXT: movdqa %xmm0, %xmm9
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm7, %xmm8
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm8, %xmm11
-; SSE2-NEXT: movdqa %xmm5, %xmm12
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm0[8],xmm12[9],xmm0[9],xmm12[10],xmm0[10],xmm12[11],xmm0[11],xmm12[12],xmm0[12],xmm12[13],xmm0[13],xmm12[14],xmm0[14],xmm12[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm11, %xmm12
-; SSE2-NEXT: movdqa %xmm1, %xmm8
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
-; SSE2-NEXT: pmullw %xmm12, %xmm8
-; SSE2-NEXT: pmullw %xmm10, %xmm8
-; SSE2-NEXT: pmullw %xmm9, %xmm8
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm6, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm2, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm7, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm3, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm5, %xmm1
-; SSE2-NEXT: pmullw %xmm4, %xmm1
-; SSE2-NEXT: pmullw %xmm8, %xmm1
-; SSE2-NEXT: pmullw %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,2,3,3]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm3, %xmm1
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: pmullw %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: pmullw %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE41-NEXT: pmullw %xmm6, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; SSE41-NEXT: pmullw %xmm2, %xmm4
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; SSE41-NEXT: pmullw %xmm7, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE41-NEXT: pmullw %xmm3, %xmm5
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE41-NEXT: pmullw %xmm5, %xmm1
-; SSE41-NEXT: pmullw %xmm4, %xmm1
-; SSE41-NEXT: pmullw %xmm7, %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm5, %xmm3
-; SSE41-NEXT: pmullw %xmm11, %xmm6
-; SSE41-NEXT: pshufb %xmm5, %xmm6
-; SSE41-NEXT: pmullw %xmm10, %xmm2
-; SSE41-NEXT: pshufb %xmm5, %xmm2
-; SSE41-NEXT: pmullw %xmm8, %xmm9
-; SSE41-NEXT: pshufb %xmm5, %xmm9
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: pmullw %xmm7, %xmm2
-; SSE41-NEXT: pshufb %xmm5, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: pmullw %xmm6, %xmm3
-; SSE41-NEXT: pshufb %xmm5, %xmm3
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: pmullw %xmm3, %xmm2
-; SSE41-NEXT: pshufb %xmm5, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: pmullw %xmm0, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; SSE41-NEXT: pmullw %xmm1, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmullw %xmm2, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm4, %xmm5, %xmm8
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm7, %xmm5, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm10, %xmm5, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; AVX1-NEXT: vpmullw %xmm10, %xmm6, %xmm6
-; AVX1-NEXT: vpmullw %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpmullw %xmm6, %xmm9, %xmm6
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
-; AVX1-NEXT: vpmullw %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
-; AVX1-NEXT: vpmullw %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm5, %xmm4
-; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3
-; AVX1-NEXT: vpmullw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmullw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX2-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX2-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX2-NEXT: vpmullw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v128i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BW-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm1
-; AVX512BW-NEXT: vpackuswb %zmm4, %zmm1, %zmm1
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BW-NEXT: vpmullw %zmm4, %zmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; AVX512BW-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BW-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512BW-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512BW-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512BW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512BW-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: test_v128i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; AVX512BWVL-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; AVX512BWVL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm4
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm1
-; AVX512BWVL-NEXT: vpackuswb %zmm4, %zmm1, %zmm1
-; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm1, %ymm1
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} zmm4 = zmm1[8],zmm0[8],zmm1[9],zmm0[9],zmm1[10],zmm0[10],zmm1[11],zmm0[11],zmm1[12],zmm0[12],zmm1[13],zmm0[13],zmm1[14],zmm0[14],zmm1[15],zmm0[15],zmm1[24],zmm0[24],zmm1[25],zmm0[25],zmm1[26],zmm0[26],zmm1[27],zmm0[27],zmm1[28],zmm0[28],zmm1[29],zmm0[29],zmm1[30],zmm0[30],zmm1[31],zmm0[31],zmm1[40],zmm0[40],zmm1[41],zmm0[41],zmm1[42],zmm0[42],zmm1[43],zmm0[43],zmm1[44],zmm0[44],zmm1[45],zmm0[45],zmm1[46],zmm0[46],zmm1[47],zmm0[47],zmm1[56],zmm0[56],zmm1[57],zmm0[57],zmm1[58],zmm0[58],zmm1[59],zmm0[59],zmm1[60],zmm0[60],zmm1[61],zmm0[61],zmm1[62],zmm0[62],zmm1[63],zmm0[63]
-; AVX512BWVL-NEXT: vpmullw %zmm4, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm0[0],zmm1[1],zmm0[1],zmm1[2],zmm0[2],zmm1[3],zmm0[3],zmm1[4],zmm0[4],zmm1[5],zmm0[5],zmm1[6],zmm0[6],zmm1[7],zmm0[7],zmm1[16],zmm0[16],zmm1[17],zmm0[17],zmm1[18],zmm0[18],zmm1[19],zmm0[19],zmm1[20],zmm0[20],zmm1[21],zmm0[21],zmm1[22],zmm0[22],zmm1[23],zmm0[23],zmm1[32],zmm0[32],zmm1[33],zmm0[33],zmm1[34],zmm0[34],zmm1[35],zmm0[35],zmm1[36],zmm0[36],zmm1[37],zmm0[37],zmm1[38],zmm0[38],zmm1[39],zmm0[39],zmm1[48],zmm0[48],zmm1[49],zmm0[49],zmm1[50],zmm0[50],zmm1[51],zmm0[51],zmm1[52],zmm0[52],zmm1[53],zmm0[53],zmm1[54],zmm0[54],zmm1[55],zmm0[55]
-; AVX512BWVL-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512BWVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512BWVL-NEXT: vpmullw %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512BWVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512BWVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512BWVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512DQ-LABEL: test_v128i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQ-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512DQ-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512DQ-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
-; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQ-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQ-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512DQVL-LABEL: test_v128i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23]
-; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512DQVL-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31]
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512DQVL-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31]
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm2, %ymm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; AVX512DQVL-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpand %ymm1, %ymm4, %ymm1
-; AVX512DQVL-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
-; AVX512DQVL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; AVX512DQVL-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm3, %xmm2
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpand %xmm1, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpackuswb %xmm3, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512DQVL-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512DQVL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE-LABEL: test_v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE-LABEL: test_v4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE-LABEL: test_v8i64:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE-LABEL: test_v16i64:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE-LABEL: test_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE-LABEL: test_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE-LABEL: test_v8i32:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE-LABEL: test_v16i32:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE-LABEL: test_v32i32:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE-LABEL: test_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE-LABEL: test_v16i16:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE-LABEL: test_v32i16:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE-LABEL: test_v64i16:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm5, %xmm3
-; SSE-NEXT: por %xmm1, %xmm3
-; SSE-NEXT: por %xmm4, %xmm2
-; SSE-NEXT: por %xmm3, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: por %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: por %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: por %xmm3, %xmm1
-; SSE41-NEXT: por %xmm2, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: por %xmm6, %xmm2
-; SSE41-NEXT: por %xmm7, %xmm3
-; SSE41-NEXT: por %xmm5, %xmm3
-; SSE41-NEXT: por %xmm1, %xmm3
-; SSE41-NEXT: por %xmm4, %xmm2
-; SSE41-NEXT: por %xmm3, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: por %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE2-LABEL: test_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE2-LABEL: test_v4i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v4i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE2-LABEL: test_v8i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: xorpd %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v8i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v8i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE2-LABEL: test_v16i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm10, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm9
-; SSE2-NEXT: por %xmm1, %xmm9
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm9
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm10
-; SSE41-NEXT: pxor %xmm9, %xmm10
-; SSE41-NEXT: movdqa %xmm10, %xmm11
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm11, %xmm0
-; SSE41-NEXT: por %xmm10, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pxor %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm3
-; SSE41-NEXT: pxor %xmm9, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: movapd %xmm6, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
-; SSE41-NEXT: movapd %xmm7, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm5, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
-; SSE41-NEXT: movapd %xmm7, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm6, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm7, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm11, %xmm5, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm9
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm10
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm7, %xmm3
-; AVX1-NEXT: vblendvpd %xmm8, %xmm5, %xmm11, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v16i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v16i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE2-LABEL: test_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE2-LABEL: test_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmaxsd %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE2-LABEL: test_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmaxsd %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE2-LABEL: test_v16i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsd %xmm3, %xmm1
-; SSE41-NEXT: pmaxsd %xmm2, %xmm1
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmaxsd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxsd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxsd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE2-LABEL: test_v32i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm8
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm8
-; SSE2-NEXT: por %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsd %xmm6, %xmm2
-; SSE41-NEXT: pmaxsd %xmm7, %xmm3
-; SSE41-NEXT: pmaxsd %xmm5, %xmm3
-; SSE41-NEXT: pmaxsd %xmm1, %xmm3
-; SSE41-NEXT: pmaxsd %xmm4, %xmm2
-; SSE41-NEXT: pmaxsd %xmm3, %xmm2
-; SSE41-NEXT: pmaxsd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pmaxsd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxsd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pmaxsw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pmaxsw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pmaxsw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE2-LABEL: test_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE2-LABEL: test_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsw %xmm1, %xmm0
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE2-LABEL: test_v32i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pmaxsw %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsw %xmm3, %xmm1
-; SSE41-NEXT: pmaxsw %xmm2, %xmm1
-; SSE41-NEXT: pmaxsw %xmm0, %xmm1
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxsw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxsw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE2-LABEL: test_v64i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxsw %xmm6, %xmm2
-; SSE2-NEXT: pmaxsw %xmm7, %xmm3
-; SSE2-NEXT: pmaxsw %xmm5, %xmm3
-; SSE2-NEXT: pmaxsw %xmm1, %xmm3
-; SSE2-NEXT: pmaxsw %xmm4, %xmm2
-; SSE2-NEXT: pmaxsw %xmm3, %xmm2
-; SSE2-NEXT: pmaxsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsw %xmm7, %xmm3
-; SSE41-NEXT: pmaxsw %xmm5, %xmm3
-; SSE41-NEXT: pmaxsw %xmm1, %xmm3
-; SSE41-NEXT: pmaxsw %xmm6, %xmm2
-; SSE41-NEXT: pmaxsw %xmm4, %xmm2
-; SSE41-NEXT: pmaxsw %xmm3, %xmm2
-; SSE41-NEXT: pmaxsw %xmm0, %xmm2
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2
-; SSE41-NEXT: phminposuw %xmm2, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpmaxsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmaxsw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxsw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxsw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: xorl $32767, %eax # imm = 0x7FFF
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmaxsb %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmaxsb %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: xorb $127, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: xorb $127, %al
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: xorb $127, %al
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsb %xmm1, %xmm0
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: xorb $127, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: xorb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: xorb $127, %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: xorb $127, %al
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtb %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsb %xmm3, %xmm1
-; SSE41-NEXT: pmaxsb %xmm2, %xmm1
-; SSE41-NEXT: pmaxsb %xmm0, %xmm1
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: xorb $127, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxsb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxsb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: xorb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: xorb $127, %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: xorb $127, %al
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm2, %xmm8
-; SSE2-NEXT: pcmpgtb %xmm6, %xmm8
-; SSE2-NEXT: pand %xmm8, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm8
-; SSE2-NEXT: por %xmm2, %xmm8
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtb %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtb %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxsb %xmm7, %xmm3
-; SSE41-NEXT: pmaxsb %xmm5, %xmm3
-; SSE41-NEXT: pmaxsb %xmm1, %xmm3
-; SSE41-NEXT: pmaxsb %xmm6, %xmm2
-; SSE41-NEXT: pmaxsb %xmm4, %xmm2
-; SSE41-NEXT: pmaxsb %xmm3, %xmm2
-; SSE41-NEXT: pmaxsb %xmm0, %xmm2
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm2, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: xorb $127, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpmaxsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmaxsb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxsb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: xorb $127, %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxsb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxsb %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: xorb $127, %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: xorb $127, %al
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.smax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smax.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.smax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smax.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.smax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smax.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.smax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smax.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE2-LABEL: test_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE2-LABEL: test_v4i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v4i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE2-LABEL: test_v8i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: xorpd %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v8i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v8i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE2-LABEL: test_v16i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm6, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm10, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm9
-; SSE2-NEXT: por %xmm2, %xmm9
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm7, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm10
-; SSE41-NEXT: pxor %xmm9, %xmm10
-; SSE41-NEXT: movdqa %xmm10, %xmm11
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm11, %xmm0
-; SSE41-NEXT: por %xmm10, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm6, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
-; SSE41-NEXT: movapd %xmm6, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm9, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm9, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm11, %xmm7, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vblendvpd %xmm10, %xmm11, %xmm7, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v16i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v16i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpminsq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE2-LABEL: test_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE2-LABEL: test_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pminsd %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE2-LABEL: test_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pminsd %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE2-LABEL: test_v16i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsd %xmm3, %xmm1
-; SSE41-NEXT: pminsd %xmm2, %xmm1
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pminsd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminsd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminsd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE2-LABEL: test_v32i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm5, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm8
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm8
-; SSE2-NEXT: por %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsd %xmm6, %xmm2
-; SSE41-NEXT: pminsd %xmm7, %xmm3
-; SSE41-NEXT: pminsd %xmm5, %xmm3
-; SSE41-NEXT: pminsd %xmm1, %xmm3
-; SSE41-NEXT: pminsd %xmm4, %xmm2
-; SSE41-NEXT: pminsd %xmm3, %xmm2
-; SSE41-NEXT: pminsd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pminsd %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminsd %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpminsd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminsd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpminsd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpminsd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminsd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pminsw %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pminsw %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pminsw %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE2-LABEL: test_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE2-LABEL: test_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsw %xmm1, %xmm0
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE2-LABEL: test_v32i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pminsw %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsw %xmm3, %xmm1
-; SSE41-NEXT: pminsw %xmm2, %xmm1
-; SSE41-NEXT: pminsw %xmm0, %xmm1
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminsw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminsw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE2-LABEL: test_v64i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pminsw %xmm6, %xmm2
-; SSE2-NEXT: pminsw %xmm7, %xmm3
-; SSE2-NEXT: pminsw %xmm5, %xmm3
-; SSE2-NEXT: pminsw %xmm1, %xmm3
-; SSE2-NEXT: pminsw %xmm4, %xmm2
-; SSE2-NEXT: pminsw %xmm3, %xmm2
-; SSE2-NEXT: pminsw %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsw %xmm7, %xmm3
-; SSE41-NEXT: pminsw %xmm5, %xmm3
-; SSE41-NEXT: pminsw %xmm1, %xmm3
-; SSE41-NEXT: pminsw %xmm6, %xmm2
-; SSE41-NEXT: pminsw %xmm4, %xmm2
-; SSE41-NEXT: pminsw %xmm3, %xmm2
-; SSE41-NEXT: pminsw %xmm0, %xmm2
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2
-; SSE41-NEXT: phminposuw %xmm2, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpminsw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpminsw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminsw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminsw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminsw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminsw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: xorl $32768, %eax # imm = 0x8000
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminsb %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pminsb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminsb %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminsb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pminsb %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminsb %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: xorb $-128, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: xorb $-128, %al
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: xorb $-128, %al
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsb %xmm1, %xmm0
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: xorb $-128, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: xorb $-128, %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: xorb $-128, %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: xorb $-128, %al
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsb %xmm3, %xmm1
-; SSE41-NEXT: pminsb %xmm2, %xmm1
-; SSE41-NEXT: pminsb %xmm0, %xmm1
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: xorb $-128, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminsb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminsb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: xorb $-128, %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: xorb $-128, %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: xorb $-128, %al
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm5, %xmm8
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm8
-; SSE2-NEXT: pand %xmm8, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm8
-; SSE2-NEXT: por %xmm1, %xmm8
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtb %xmm8, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm8
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsb %xmm7, %xmm3
-; SSE41-NEXT: pminsb %xmm5, %xmm3
-; SSE41-NEXT: pminsb %xmm1, %xmm3
-; SSE41-NEXT: pminsb %xmm6, %xmm2
-; SSE41-NEXT: pminsb %xmm4, %xmm2
-; SSE41-NEXT: pminsb %xmm3, %xmm2
-; SSE41-NEXT: pminsb %xmm0, %xmm2
-; SSE41-NEXT: pxor {{.*}}(%rip), %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm2, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: xorb $-128, %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpminsb %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpminsb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminsb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminsb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: xorb $-128, %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminsb %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminsb %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: xorb $-128, %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsb %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminsb %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminsb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: xorb $-128, %al
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.smin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.smin.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.smin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.smin.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.smin.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.smin.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.smin.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.smin.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE2-LABEL: test_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE2-LABEL: test_v4i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v4i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE2-LABEL: test_v8i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm7
-; SSE41-NEXT: pxor %xmm5, %xmm7
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm5, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm0
-; SSE41-NEXT: xorpd %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpxor %xmm2, %xmm4, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpxor %xmm2, %xmm6, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm4, %xmm4
-; AVX1-NEXT: vxorpd %xmm2, %xmm4, %xmm5
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm4
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3
-; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v8i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v8i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE2-LABEL: test_v16i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm5, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm10, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm9
-; SSE2-NEXT: por %xmm1, %xmm9
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm9
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm10
-; SSE41-NEXT: pxor %xmm9, %xmm10
-; SSE41-NEXT: movdqa %xmm10, %xmm11
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm11, %xmm0
-; SSE41-NEXT: por %xmm10, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pxor %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm8, %xmm3
-; SSE41-NEXT: pxor %xmm9, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: movapd %xmm6, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
-; SSE41-NEXT: movapd %xmm7, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm5, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
-; SSE41-NEXT: movapd %xmm7, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm6, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm7, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm4, %xmm8, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10
-; AVX1-NEXT: vpxor %xmm4, %xmm10, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11
-; AVX1-NEXT: vpxor %xmm4, %xmm11, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm12
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm13
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
-; AVX1-NEXT: vblendvpd %xmm13, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm12, %xmm7, %xmm11, %xmm3
-; AVX1-NEXT: vxorpd %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vblendvpd %xmm9, %xmm10, %xmm8, %xmm6
-; AVX1-NEXT: vxorpd %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm3, %xmm3
-; AVX1-NEXT: vxorpd %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
-; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm6
-; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm2
-; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm2
-; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2
-; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v16i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v16i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE2-LABEL: test_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxud %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE2-LABEL: test_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmaxud %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE2-LABEL: test_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movd %xmm3, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmaxud %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE2-LABEL: test_v16i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxud %xmm3, %xmm1
-; SSE41-NEXT: pmaxud %xmm2, %xmm1
-; SSE41-NEXT: pmaxud %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmaxud %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxud %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxud %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxud %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE2-LABEL: test_v32i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm5, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm1, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm9
-; SSE2-NEXT: por %xmm1, %xmm9
-; SSE2-NEXT: movdqa %xmm7, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm4, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm9
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxud %xmm6, %xmm2
-; SSE41-NEXT: pmaxud %xmm7, %xmm3
-; SSE41-NEXT: pmaxud %xmm5, %xmm3
-; SSE41-NEXT: pmaxud %xmm1, %xmm3
-; SSE41-NEXT: pmaxud %xmm4, %xmm2
-; SSE41-NEXT: pmaxud %xmm3, %xmm2
-; SSE41-NEXT: pmaxud %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pmaxud %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxud %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmaxud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmaxud %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmaxud %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxud %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxud %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpmaxud %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE2-LABEL: test_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmaxuw %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE2-LABEL: test_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxuw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmaxuw %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE2-LABEL: test_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: notl %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: notl %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512BW-LABEL: test_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: notl %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: notl %eax
-; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512VL-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE2-LABEL: test_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxuw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: notl %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: notl %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: notl %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v16i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: notl %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v16i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: notl %eax
-; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE2-LABEL: test_v32i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pmaxsw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxuw %xmm3, %xmm1
-; SSE41-NEXT: pmaxuw %xmm2, %xmm1
-; SSE41-NEXT: pmaxuw %xmm0, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: notl %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxuw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: notl %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: notl %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v32i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: notl %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v32i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: notl %eax
-; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE2-LABEL: test_v64i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pmaxsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pmaxsw %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pmaxsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pmaxsw %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm5, %xmm1
-; SSE2-NEXT: pmaxsw %xmm4, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pmaxsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pmaxsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxuw %xmm7, %xmm3
-; SSE41-NEXT: pmaxuw %xmm5, %xmm3
-; SSE41-NEXT: pmaxuw %xmm1, %xmm3
-; SSE41-NEXT: pmaxuw %xmm6, %xmm2
-; SSE41-NEXT: pmaxuw %xmm4, %xmm2
-; SSE41-NEXT: pmaxuw %xmm3, %xmm2
-; SSE41-NEXT: pmaxuw %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: notl %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpmaxuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmaxuw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: notl %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxuw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxuw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: notl %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v64i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512BW-NEXT: vmovd %xmm0, %eax
-; AVX512BW-NEXT: notl %eax
-; AVX512BW-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v64i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: notl %eax
-; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pmaxub %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmaxub %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: notb %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: notb %al
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512BW-LABEL: test_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: notb %al
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: notb %al
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxub %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: notb %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: notb %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v32i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: notb %al
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v32i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: notb %al
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxub %xmm3, %xmm1
-; SSE2-NEXT: pmaxub %xmm2, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxub %xmm3, %xmm1
-; SSE41-NEXT: pmaxub %xmm2, %xmm1
-; SSE41-NEXT: pmaxub %xmm0, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: notb %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmaxub %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpmaxub %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: notb %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v64i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: notb %al
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v64i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: notb %al
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmaxub %xmm6, %xmm2
-; SSE2-NEXT: pmaxub %xmm7, %xmm3
-; SSE2-NEXT: pmaxub %xmm5, %xmm3
-; SSE2-NEXT: pmaxub %xmm1, %xmm3
-; SSE2-NEXT: pmaxub %xmm4, %xmm2
-; SSE2-NEXT: pmaxub %xmm3, %xmm2
-; SSE2-NEXT: pmaxub %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: pmaxub %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pmaxub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pmaxub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmaxub %xmm7, %xmm3
-; SSE41-NEXT: pmaxub %xmm5, %xmm3
-; SSE41-NEXT: pmaxub %xmm1, %xmm3
-; SSE41-NEXT: pmaxub %xmm6, %xmm2
-; SSE41-NEXT: pmaxub %xmm4, %xmm2
-; SSE41-NEXT: pmaxub %xmm3, %xmm2
-; SSE41-NEXT: pmaxub %xmm0, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: notb %al
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpmaxub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpmaxub %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxub %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmaxub %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: notb %al
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmaxub %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpmaxub %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: notb %al
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v128i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512BW-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512BW-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512BW-NEXT: notb %al
-; AVX512BW-NEXT: # kill: def $al killed $al killed $eax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v128i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmaxub %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpmaxub %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512VL-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512VL-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512VL-NEXT: notb %al
-; AVX512VL-NEXT: # kill: def $al killed $al killed $eax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.umax.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umax.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.umax.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umax.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.umax.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umax.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.umax.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umax.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE2-LABEL: test_v2i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX-NEXT: vpxor %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512BW-LABEL: test_v2i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v2i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE2-LABEL: test_v4i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movq %xmm2, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: pxor %xmm3, %xmm4
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movq %xmm2, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v4i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v4i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE2-LABEL: test_v8i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm8, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm5
-; SSE2-NEXT: por %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm5, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm3, %xmm6
-; SSE41-NEXT: pxor %xmm5, %xmm6
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm0
-; SSE41-NEXT: xorpd %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpxor %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm6
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1
-; AVX1-NEXT: vblendvpd %xmm4, %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2
-; AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
-; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v8i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v8i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE2-LABEL: test_v16i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm2, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa %xmm6, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm10, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm9
-; SSE2-NEXT: por %xmm2, %xmm9
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm10
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm10
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm10[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm10[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm7, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movq %xmm3, %rax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm10
-; SSE41-NEXT: pxor %xmm9, %xmm10
-; SSE41-NEXT: movdqa %xmm10, %xmm11
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm11
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm11, %xmm0
-; SSE41-NEXT: por %xmm10, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm7, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm5, %xmm2
-; SSE41-NEXT: pxor %xmm9, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7
-; SSE41-NEXT: movapd %xmm4, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm6, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6
-; SSE41-NEXT: movapd %xmm6, %xmm0
-; SSE41-NEXT: xorpd %xmm9, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,0,1]
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm9, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm9, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: movq %xmm1, %rax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm8
-; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm6
-; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
-; AVX1-NEXT: vpxor %xmm4, %xmm12, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm13
-; AVX1-NEXT: vpxor %xmm4, %xmm13, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm10, %xmm5, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm5, %xmm11
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpxor %xmm4, %xmm7, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm11, %xmm6, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vxorpd %xmm4, %xmm5, %xmm11
-; AVX1-NEXT: vblendvpd %xmm10, %xmm12, %xmm13, %xmm7
-; AVX1-NEXT: vxorpd %xmm4, %xmm7, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm11, %xmm6, %xmm6
-; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm8, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm7, %xmm2
-; AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm2
-; AVX1-NEXT: vxorpd %xmm4, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vpxor %ymm4, %ymm3, %ymm6
-; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm3
-; AVX2-NEXT: vpxor %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vxorpd %ymm4, %ymm0, %ymm2
-; AVX2-NEXT: vxorpd %ymm4, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2
-; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm2
-; AVX2-NEXT: vxorpd %xmm4, %xmm1, %xmm3
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512BW-LABEL: test_v16i64:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovq %xmm0, %rax
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512VL-LABEL: test_v16i64:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512VL-NEXT: vpminuq %zmm1, %zmm0, %zmm0
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512VL-NEXT: vpminuq %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rax
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE2-LABEL: test_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminud %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE2-LABEL: test_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pminud %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pminud %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE2-LABEL: test_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movd %xmm2, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminud %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pminud %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pminud %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE2-LABEL: test_v16i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm6
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm6, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: movd %xmm4, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminud %xmm3, %xmm1
-; SSE41-NEXT: pminud %xmm2, %xmm1
-; SSE41-NEXT: pminud %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pminud %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminud %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminud %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE2-LABEL: test_v32i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm10
-; SSE2-NEXT: pxor %xmm8, %xmm10
-; SSE2-NEXT: movdqa %xmm6, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm9
-; SSE2-NEXT: por %xmm2, %xmm9
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm4, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm7, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm7, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm5, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm4, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm9, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm8
-; SSE2-NEXT: pand %xmm8, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm8
-; SSE2-NEXT: por %xmm3, %xmm8
-; SSE2-NEXT: movd %xmm8, %eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminud %xmm6, %xmm2
-; SSE41-NEXT: pminud %xmm7, %xmm3
-; SSE41-NEXT: pminud %xmm5, %xmm3
-; SSE41-NEXT: pminud %xmm1, %xmm3
-; SSE41-NEXT: pminud %xmm4, %xmm2
-; SSE41-NEXT: pminud %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminud %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminud %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpminud %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpminud %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminud %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminud %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminud %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE2-LABEL: test_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pminuw %xmm0, %xmm1
-; SSE41-NEXT: movd %xmm1, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE2-LABEL: test_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminuw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pminuw %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE2-LABEL: test_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE2-LABEL: test_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminuw %xmm1, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE2-LABEL: test_v32i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pminsw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminuw %xmm3, %xmm1
-; SSE41-NEXT: pminuw %xmm2, %xmm1
-; SSE41-NEXT: pminuw %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminuw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE2-LABEL: test_v64i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: pminsw %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: pminsw %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm7
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: pminsw %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm5
-; SSE2-NEXT: pminsw %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pminsw %xmm5, %xmm1
-; SSE2-NEXT: pminsw %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: pminsw %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: pminsw %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: xorl $32768, %eax # imm = 0x8000
-; SSE2-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminuw %xmm7, %xmm3
-; SSE41-NEXT: pminuw %xmm5, %xmm3
-; SSE41-NEXT: pminuw %xmm1, %xmm3
-; SSE41-NEXT: pminuw %xmm6, %xmm2
-; SSE41-NEXT: pminuw %xmm4, %xmm2
-; SSE41-NEXT: pminuw %xmm3, %xmm2
-; SSE41-NEXT: pminuw %xmm0, %xmm2
-; SSE41-NEXT: phminposuw %xmm2, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpminuw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminuw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminuw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminuw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminuw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminuw %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminuw %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vphminposuw %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: test_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: phminposuw %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pminub %xmm3, %xmm1
-; SSE2-NEXT: pminub %xmm2, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminub %xmm3, %xmm1
-; SSE41-NEXT: pminub %xmm2, %xmm1
-; SSE41-NEXT: pminub %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm1, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpminub %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpminub %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pminub %xmm6, %xmm2
-; SSE2-NEXT: pminub %xmm7, %xmm3
-; SSE2-NEXT: pminub %xmm5, %xmm3
-; SSE2-NEXT: pminub %xmm1, %xmm3
-; SSE2-NEXT: pminub %xmm4, %xmm2
-; SSE2-NEXT: pminub %xmm3, %xmm2
-; SSE2-NEXT: pminub %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: pminub %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pminub %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pminub %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminub %xmm7, %xmm3
-; SSE41-NEXT: pminub %xmm5, %xmm3
-; SSE41-NEXT: pminub %xmm1, %xmm3
-; SSE41-NEXT: pminub %xmm6, %xmm2
-; SSE41-NEXT: pminub %xmm4, %xmm2
-; SSE41-NEXT: pminub %xmm3, %xmm2
-; SSE41-NEXT: pminub %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pminub %xmm2, %xmm0
-; SSE41-NEXT: phminposuw %xmm0, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpminub %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpminub %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpminub %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vphminposuw %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminub %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpminub %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vphminposuw %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminub %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpminub %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vphminposuw %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.umin.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.umin.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.umin.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.umin.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.umin.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.umin.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.umin.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.umin.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512
-
-;
-; vXi64
-;
-
-define i64 @test_v2i64(<2 x i64> %a0) {
-; SSE-LABEL: test_v2i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i64:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v4i64(<4 x i64> %a0) {
-; SSE-LABEL: test_v4i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v8i64(<8 x i64> %a0) {
-; SSE-LABEL: test_v8i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v8i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> %a0)
- ret i64 %1
-}
-
-define i64 @test_v16i64(<16 x i64> %a0) {
-; SSE-LABEL: test_v16i64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, %rax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, %rax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> %a0)
- ret i64 %1
-}
-
-;
-; vXi32
-;
-
-define i32 @test_v2i32(<2 x i32> %a0) {
-; SSE-LABEL: test_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v4i32(<4 x i32> %a0) {
-; SSE-LABEL: test_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v8i32(<8 x i32> %a0) {
-; SSE-LABEL: test_v8i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v8i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v16i32(<16 x i32> %a0) {
-; SSE-LABEL: test_v16i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> %a0)
- ret i32 %1
-}
-
-define i32 @test_v32i32(<32 x i32> %a0) {
-; SSE-LABEL: test_v32i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> %a0)
- ret i32 %1
-}
-
-;
-; vXi16
-;
-
-define i16 @test_v2i16(<2 x i16> %a0) {
-; SSE-LABEL: test_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v4i16(<4 x i16> %a0) {
-; SSE-LABEL: test_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v8i16(<8 x i16> %a0) {
-; SSE-LABEL: test_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: test_v8i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v16i16(<16 x i16> %a0) {
-; SSE-LABEL: test_v16i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movd %xmm1, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v32i16(<32 x i16> %a0) {
-; SSE-LABEL: test_v32i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v32i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> %a0)
- ret i16 %1
-}
-
-define i16 @test_v64i16(<64 x i16> %a0) {
-; SSE-LABEL: test_v64i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm5, %xmm3
-; SSE-NEXT: pxor %xmm1, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm2
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: pxor %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT: pxor %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: movd %xmm0, %eax
-; SSE-NEXT: # kill: def $ax killed $ax killed $eax
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: test_v64i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, %eax
-; AVX1-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovd %xmm0, %eax
-; AVX2-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovd %xmm0, %eax
-; AVX512-NEXT: # kill: def $ax killed $ax killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> %a0)
- ret i16 %1
-}
-
-;
-; vXi8
-;
-
-define i8 @test_v2i8(<2 x i8> %a0) {
-; SSE2-LABEL: test_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v4i8(<4 x i8> %a0) {
-; SSE2-LABEL: test_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v8i8(<8 x i8> %a0) {
-; SSE2-LABEL: test_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v16i8(<16 x i8> %a0) {
-; SSE2-LABEL: test_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: test_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpextrb $0, %xmm0, %eax
-; AVX-NEXT: # kill: def $al killed $al killed $eax
-; AVX-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v32i8(<32 x i8> %a0) {
-; SSE2-LABEL: test_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $16, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrlw $8, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pextrb $0, %xmm0, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v32i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v64i8(<64 x i8> %a0) {
-; SSE2-LABEL: test_v64i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v64i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm3, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v64i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v64i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v64i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> %a0)
- ret i8 %1
-}
-
-define i8 @test_v128i8(<128 x i8> %a0) {
-; SSE2-LABEL: test_v128i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm3
-; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: psrld $16, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $8, %xmm1
-; SSE2-NEXT: pxor %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: # kill: def $al killed $al killed $eax
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: test_v128i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm6, %xmm2
-; SSE41-NEXT: pxor %xmm7, %xmm3
-; SSE41-NEXT: pxor %xmm5, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm2
-; SSE41-NEXT: pxor %xmm3, %xmm2
-; SSE41-NEXT: pxor %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm1
-; SSE41-NEXT: pextrb $0, %xmm1, %eax
-; SSE41-NEXT: # kill: def $al killed $al killed $eax
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: test_v128i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm2, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpextrb $0, %xmm0, %eax
-; AVX1-NEXT: # kill: def $al killed $al killed $eax
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: test_v128i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpextrb $0, %xmm0, %eax
-; AVX2-NEXT: # kill: def $al killed $al killed $eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: test_v128i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpextrb $0, %xmm0, %eax
-; AVX512-NEXT: # kill: def $al killed $al killed $eax
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> %a0)
- ret i8 %1
-}
-
-declare i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64>)
-declare i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64>)
-
-declare i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32>)
-declare i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32>)
-
-declare i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16>)
-declare i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16>)
-
-declare i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8>)
-declare i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8>)
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
-;
-; Just two 32-bit runs to make sure we do reasonable things there.
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X32-SSE,X32-SSE41
-
-define <8 x i16> @sext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i8_to_8i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i8_to_8i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i8_to_8i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_16i8_to_8i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i8_to_8i16:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: psraw $8, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i8_to_8i16:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %C = sext <8 x i8> %B to <8 x i16>
- ret <8 x i16> %C
-}
-
-define <16 x i16> @sext_16i8_to_16i16(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i8_to_16i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i8_to_16i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSSE3-NEXT: psraw $8, %xmm2
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSSE3-NEXT: psraw $8, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i8_to_16i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxbw %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_16i8_to_16i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_16i8_to_16i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_16i8_to_16i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i8_to_16i16:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-SSE2-NEXT: psraw $8, %xmm2
-; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; X32-SSE2-NEXT: psraw $8, %xmm1
-; X32-SSE2-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i8_to_16i16:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = sext <16 x i8> %A to <16 x i16>
- ret <16 x i16> %B
-}
-
-define <32 x i16> @sext_32i8_to_32i16(<32 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_32i8_to_32i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT: psraw $8, %xmm4
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSE2-NEXT: psraw $8, %xmm5
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT: psraw $8, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSE2-NEXT: psraw $8, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_32i8_to_32i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSSE3-NEXT: psraw $8, %xmm4
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; SSSE3-NEXT: psraw $8, %xmm5
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT: psraw $8, %xmm2
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; SSSE3-NEXT: psraw $8, %xmm3
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_32i8_to_32i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw %xmm0, %xmm5
-; SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxbw %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovsxbw %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_32i8_to_32i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxbw %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_32i8_to_32i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sext_32i8_to_32i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm2
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sext_32i8_to_32i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0
-; AVX512BW-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_32i8_to_32i16:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; X32-SSE2-NEXT: psraw $8, %xmm4
-; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
-; X32-SSE2-NEXT: psraw $8, %xmm5
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; X32-SSE2-NEXT: psraw $8, %xmm2
-; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15]
-; X32-SSE2-NEXT: psraw $8, %xmm3
-; X32-SSE2-NEXT: movdqa %xmm4, %xmm0
-; X32-SSE2-NEXT: movdqa %xmm5, %xmm1
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_32i8_to_32i16:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm5
-; X32-SSE41-NEXT: pmovsxbw %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm3
-; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
-; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %B = sext <32 x i8> %A to <32 x i16>
- ret <32 x i16> %B
-}
-
-define <4 x i32> @sext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i8_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i8_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i8_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_16i8_to_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i8_to_4i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i8_to_4i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %C = sext <4 x i8> %B to <4 x i32>
- ret <4 x i32> %C
-}
-
-define <8 x i32> @sext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i8_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i8_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i8_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_16i8_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_16i8_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_16i8_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbd %xmm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i8_to_8i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $24, %xmm2
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-SSE2-NEXT: psrad $24, %xmm1
-; X32-SSE2-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i8_to_8i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %C = sext <8 x i8> %B to <8 x i32>
- ret <8 x i32> %C
-}
-
-define <16 x i32> @sext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i8_to_16i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSE2-NEXT: psrad $24, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: psrad $24, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i8_to_16i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; SSSE3-NEXT: psrad $24, %xmm4
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: psrad $24, %xmm2
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: psrad $24, %xmm3
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i8_to_16i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxbd %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxbd %xmm2, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_16i8_to_16i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxbd %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_16i8_to_16i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovsxbd %xmm0, %ymm1
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_16i8_to_16i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i8_to_16i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
-; X32-SSE2-NEXT: psrad $24, %xmm4
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: psrad $24, %xmm1
-; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $24, %xmm2
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X32-SSE2-NEXT: psrad $24, %xmm3
-; X32-SSE2-NEXT: movdqa %xmm4, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i8_to_16i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X32-SSE41-NEXT: pmovsxbd %xmm1, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxbd %xmm2, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm3
-; X32-SSE41-NEXT: movdqa %xmm4, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = sext <16 x i8> %A to <16 x i32>
- ret <16 x i32> %B
-}
-
-define <2 x i64> @sext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i8_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i8_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i8_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_16i8_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxbq %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i8_to_2i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i8_to_2i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
- %C = sext <2 x i8> %B to <2 x i64>
- ret <2 x i64> %C
-}
-
-define <4 x i64> @sext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i8_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i8_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i8_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm2
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_16i8_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_16i8_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_16i8_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i8_to_4i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $24, %xmm1
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i8_to_4i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2
-; X32-SSE41-NEXT: psrld $16, %xmm0
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %C = sext <4 x i8> %B to <4 x i64>
- ret <4 x i64> %C
-}
-
-define <8 x i64> @sext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i8_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: psrad $24, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i8_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: psrad $24, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i8_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmovsxbq %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxbq %xmm2, %xmm2
-; SSE41-NEXT: psrlq $48, %xmm0
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_16i8_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX1-NEXT: vpmovsxbq %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_16i8_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbq %xmm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovsxbq %xmm0, %ymm1
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_16i8_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbq %xmm0, %zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i8_to_8i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $24, %xmm1
-; X32-SSE2-NEXT: pxor %xmm5, %xmm5
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X32-SSE2-NEXT: psrad $24, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; X32-SSE2-NEXT: movdqa %xmm3, %xmm2
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; X32-SSE2-NEXT: movdqa %xmm4, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i8_to_8i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm4
-; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE41-NEXT: psrld $16, %xmm1
-; X32-SSE41-NEXT: pmovsxbq %xmm1, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; X32-SSE41-NEXT: pmovsxbq %xmm2, %xmm2
-; X32-SSE41-NEXT: psrlq $48, %xmm0
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm3
-; X32-SSE41-NEXT: movdqa %xmm4, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %C = sext <8 x i8> %B to <8 x i64>
- ret <8 x i64> %C
-}
-
-define <4 x i32> @sext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_8i16_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_8i16_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_8i16_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_8i16_to_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_8i16_to_4i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: psrad $16, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_8i16_to_4i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %C = sext <4 x i16> %B to <4 x i32>
- ret <4 x i32> %C
-}
-
-define <8 x i32> @sext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_8i16_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_8i16_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_8i16_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_8i16_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_8i16_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_8i16_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_8i16_to_8i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $16, %xmm2
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X32-SSE2-NEXT: psrad $16, %xmm1
-; X32-SSE2-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_8i16_to_8i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = sext <8 x i16> %A to <8 x i32>
- ret <8 x i32> %B
-}
-
-define <16 x i32> @sext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_16i16_to_16i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_16i16_to_16i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSSE3-NEXT: psrad $16, %xmm4
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSSE3-NEXT: psrad $16, %xmm5
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_16i16_to_16i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm5
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_16i16_to_16i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_16i16_to_16i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm1
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_16i16_to_16i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxwd %ymm0, %zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_16i16_to_16i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $16, %xmm4
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; X32-SSE2-NEXT: psrad $16, %xmm5
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; X32-SSE2-NEXT: psrad $16, %xmm2
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; X32-SSE2-NEXT: psrad $16, %xmm3
-; X32-SSE2-NEXT: movdqa %xmm4, %xmm0
-; X32-SSE2-NEXT: movdqa %xmm5, %xmm1
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_16i16_to_16i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm5
-; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxwd %xmm0, %xmm3
-; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
-; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %B = sext <16 x i16> %A to <16 x i32>
- ret <16 x i32> %B
-}
-
-define <2 x i64> @sext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_8i16_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_8i16_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_8i16_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_8i16_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxwq %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_8i16_to_2i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; X32-SSE2-NEXT: psrad $16, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_8i16_to_2i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
- %C = sext <2 x i16> %B to <2 x i64>
- ret <2 x i64> %C
-}
-
-define <4 x i64> @sext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_8i16_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_8i16_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_8i16_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_8i16_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_8i16_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_8i16_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxwq %xmm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_8i16_to_4i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $16, %xmm1
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_8i16_to_4i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %C = sext <4 x i16> %B to <4 x i64>
- ret <4 x i64> %C
-}
-
-define <8 x i64> @sext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_8i16_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_8i16_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_8i16_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxwq %xmm2, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovsxwq %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_8i16_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
-; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_8i16_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxwq %xmm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovsxwq %xmm0, %ymm1
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_8i16_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxwq %xmm0, %zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_8i16_to_8i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $16, %xmm1
-; X32-SSE2-NEXT: pxor %xmm5, %xmm5
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm4
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; X32-SSE2-NEXT: psrad $16, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; X32-SSE2-NEXT: movdqa %xmm3, %xmm2
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
-; X32-SSE2-NEXT: movdqa %xmm4, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_8i16_to_8i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; X32-SSE41-NEXT: pmovsxwq %xmm1, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxwq %xmm2, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; X32-SSE41-NEXT: pmovsxwq %xmm0, %xmm3
-; X32-SSE41-NEXT: movdqa %xmm4, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = sext <8 x i16> %A to <8 x i64>
- ret <8 x i64> %B
-}
-
-define <2 x i64> @sext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_4i32_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_4i32_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_4i32_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_4i32_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_4i32_to_2i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_4i32_to_2i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
- %C = sext <2 x i32> %B to <2 x i64>
- ret <2 x i64> %C
-}
-
-define <4 x i64> @sext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_4i32_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_4i32_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_4i32_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_4i32_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_4i32_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_4i32_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_4i32_to_4i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pxor %xmm3, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_4i32_to_4i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %B = sext <4 x i32> %A to <4 x i64>
- ret <4 x i64> %B
-}
-
-define <8 x i64> @sext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_8i32_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_8i32_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm5, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_8i32_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm5
-; SSE41-NEXT: pmovsxdq %xmm1, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_8i32_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_8i32_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm1
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_8i32_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxdq %ymm0, %zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_8i32_to_8i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE2-NEXT: pxor %xmm4, %xmm4
-; X32-SSE2-NEXT: pxor %xmm3, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pxor %xmm5, %xmm5
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm5
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X32-SSE2-NEXT: pxor %xmm3, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_8i32_to_8i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm5
-; X32-SSE41-NEXT: pmovsxdq %xmm1, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm4
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm3
-; X32-SSE41-NEXT: movdqa %xmm5, %xmm0
-; X32-SSE41-NEXT: movdqa %xmm4, %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %B = sext <8 x i32> %A to <8 x i64>
- ret <8 x i64> %B
-}
-
-define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
-; SSE-LABEL: load_sext_2i1_to_2i64:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: movzbl (%rdi), %eax
-; SSE-NEXT: movq %rax, %rcx
-; SSE-NEXT: shlq $62, %rcx
-; SSE-NEXT: movq %rcx, %xmm0
-; SSE-NEXT: shlq $63, %rax
-; SSE-NEXT: movq %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: psrad $31, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: load_sext_2i1_to_2i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: movzbl (%rdi), %eax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $62, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: shlq $63, %rax
-; AVX1-NEXT: vmovq %rax, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_2i1_to_2i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movzbl (%rdi), %eax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $62, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: shlq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_2i1_to_2i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: kmovw (%rdi), %k1
-; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_2i1_to_2i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movzbl (%eax), %eax
-; X32-SSE2-NEXT: movl %eax, %ecx
-; X32-SSE2-NEXT: shll $30, %ecx
-; X32-SSE2-NEXT: movd %ecx, %xmm0
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1]
-; X32-SSE2-NEXT: shll $31, %eax
-; X32-SSE2-NEXT: movd %eax, %xmm0
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE2-NEXT: psrad $31, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movzbl (%eax), %eax
-; X32-SSE41-NEXT: movl %eax, %ecx
-; X32-SSE41-NEXT: shll $31, %ecx
-; X32-SSE41-NEXT: movd %ecx, %xmm0
-; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; X32-SSE41-NEXT: shll $30, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: psrad $31, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <2 x i1>, <2 x i1>* %ptr
- %Y = sext <2 x i1> %X to <2 x i64>
- ret <2 x i64> %Y
-}
-
-define <2 x i64> @load_sext_2i8_to_2i64(<2 x i8> *%ptr) {
-; SSE2-LABEL: load_sext_2i8_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movzwl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_2i8_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movzwl (%rdi), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_2i8_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_sext_2i8_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxbq (%rdi), %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_2i8_to_2i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movzwl (%eax), %eax
-; X32-SSE2-NEXT: movd %eax, %xmm0
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_2i8_to_2i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <2 x i8>, <2 x i8>* %ptr
- %Y = sext <2 x i8> %X to <2 x i64>
- ret <2 x i64> %Y
-}
-
-define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
-; SSE2-LABEL: load_sext_4i1_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movl (%rdi), %eax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $60, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $61, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $62, %rcx
-; SSE2-NEXT: sarq $63, %rcx
-; SSE2-NEXT: movd %ecx, %xmm2
-; SSE2-NEXT: shlq $63, %rax
-; SSE2-NEXT: sarq $63, %rax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_4i1_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movl (%rdi), %eax
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $60, %rcx
-; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $61, %rcx
-; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $62, %rcx
-; SSSE3-NEXT: sarq $63, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm2
-; SSSE3-NEXT: shlq $63, %rax
-; SSSE3-NEXT: sarq $63, %rax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_4i1_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movl (%rdi), %eax
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shlq $62, %rcx
-; SSE41-NEXT: sarq $63, %rcx
-; SSE41-NEXT: movq %rax, %rdx
-; SSE41-NEXT: shlq $63, %rdx
-; SSE41-NEXT: sarq $63, %rdx
-; SSE41-NEXT: movd %edx, %xmm0
-; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shlq $61, %rcx
-; SSE41-NEXT: sarq $63, %rcx
-; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
-; SSE41-NEXT: shlq $60, %rax
-; SSE41-NEXT: sarq $63, %rax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_4i1_to_4i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: movl (%rdi), %eax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $62, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: movq %rax, %rdx
-; AVX1-NEXT: shlq $63, %rdx
-; AVX1-NEXT: sarq $63, %rdx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $61, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shlq $60, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_4i1_to_4i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movl (%rdi), %eax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $62, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: movq %rax, %rdx
-; AVX2-NEXT: shlq $63, %rdx
-; AVX2-NEXT: sarq $63, %rdx
-; AVX2-NEXT: vmovd %edx, %xmm0
-; AVX2-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $61, %rcx
-; AVX2-NEXT: sarq $63, %rcx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: shlq $60, %rax
-; AVX2-NEXT: sarq $63, %rax
-; AVX2-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_4i1_to_4i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: kmovw (%rdi), %k1
-; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_4i1_to_4i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movl (%eax), %eax
-; X32-SSE2-NEXT: movl %eax, %ecx
-; X32-SSE2-NEXT: shll $28, %ecx
-; X32-SSE2-NEXT: movd %ecx, %xmm0
-; X32-SSE2-NEXT: movl %eax, %ecx
-; X32-SSE2-NEXT: shll $29, %ecx
-; X32-SSE2-NEXT: movd %ecx, %xmm1
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE2-NEXT: movl %eax, %ecx
-; X32-SSE2-NEXT: shll $30, %ecx
-; X32-SSE2-NEXT: movd %ecx, %xmm2
-; X32-SSE2-NEXT: shll $31, %eax
-; X32-SSE2-NEXT: movd %eax, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE2-NEXT: psrad $31, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movl (%eax), %eax
-; X32-SSE41-NEXT: movl %eax, %ecx
-; X32-SSE41-NEXT: shll $30, %ecx
-; X32-SSE41-NEXT: movl %eax, %edx
-; X32-SSE41-NEXT: shll $31, %edx
-; X32-SSE41-NEXT: movd %edx, %xmm0
-; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; X32-SSE41-NEXT: movl %eax, %ecx
-; X32-SSE41-NEXT: shll $29, %ecx
-; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm0
-; X32-SSE41-NEXT: shll $28, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: psrad $31, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <4 x i1>, <4 x i1>* %ptr
- %Y = sext <4 x i1> %X to <4 x i32>
- ret <4 x i32> %Y
-}
-
-define <4 x i32> @load_sext_4i8_to_4i32(<4 x i8> *%ptr) {
-; SSE2-LABEL: load_sext_4i8_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_4i8_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_4i8_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_sext_4i8_to_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxbd (%rdi), %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_4i8_to_4i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_4i8_to_4i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <4 x i8>, <4 x i8>* %ptr
- %Y = sext <4 x i8> %X to <4 x i32>
- ret <4 x i32> %Y
-}
-
-define <4 x i64> @load_sext_4i1_to_4i64(<4 x i1> *%ptr) {
-; SSE2-LABEL: load_sext_4i1_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movl (%rdi), %eax
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $3, %ecx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: movl %eax, %ecx
-; SSE2-NEXT: shrl $2, %ecx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: shrl %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
-; SSE2-NEXT: psllq $63, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
-; SSE2-NEXT: psllq $63, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_4i1_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movl (%rdi), %eax
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $3, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: movl %eax, %ecx
-; SSSE3-NEXT: shrl $2, %ecx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: shrl %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
-; SSSE3-NEXT: psllq $63, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
-; SSSE3-NEXT: psllq $63, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_4i1_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movl (%rdi), %eax
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: shrl %ecx
-; SSE41-NEXT: movd %eax, %xmm1
-; SSE41-NEXT: pinsrd $1, %ecx, %xmm1
-; SSE41-NEXT: movl %eax, %ecx
-; SSE41-NEXT: shrl $2, %ecx
-; SSE41-NEXT: pinsrd $2, %ecx, %xmm1
-; SSE41-NEXT: shrl $3, %eax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm1
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: psllq $63, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: psllq $63, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_4i1_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: movl (%rdi), %eax
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $62, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: movq %rax, %rdx
-; AVX1-NEXT: shlq $63, %rdx
-; AVX1-NEXT: sarq $63, %rdx
-; AVX1-NEXT: vmovd %edx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shlq $61, %rcx
-; AVX1-NEXT: sarq $63, %rcx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: shlq $60, %rax
-; AVX1-NEXT: sarq $63, %rax
-; AVX1-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_4i1_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movl (%rdi), %eax
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $60, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $61, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shlq $62, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: shlq $63, %rax
-; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_4i1_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: kmovw (%rdi), %k1
-; AVX512-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_4i1_to_4i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movzbl (%eax), %eax
-; X32-SSE2-NEXT: movl %eax, %ecx
-; X32-SSE2-NEXT: shrl $3, %ecx
-; X32-SSE2-NEXT: movd %ecx, %xmm0
-; X32-SSE2-NEXT: movl %eax, %ecx
-; X32-SSE2-NEXT: shrl $2, %ecx
-; X32-SSE2-NEXT: movd %ecx, %xmm1
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE2-NEXT: movd %eax, %xmm2
-; X32-SSE2-NEXT: shrl %eax
-; X32-SSE2-NEXT: movd %eax, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; X32-SSE2-NEXT: pand {{\.LCPI.*}}, %xmm2
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,1,3]
-; X32-SSE2-NEXT: psllq $63, %xmm0
-; X32-SSE2-NEXT: psrad $31, %xmm0
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,3,3]
-; X32-SSE2-NEXT: psllq $63, %xmm1
-; X32-SSE2-NEXT: psrad $31, %xmm1
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_4i1_to_4i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movzbl (%eax), %eax
-; X32-SSE41-NEXT: movl %eax, %ecx
-; X32-SSE41-NEXT: shrl %ecx
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: pinsrd $1, %ecx, %xmm1
-; X32-SSE41-NEXT: movl %eax, %ecx
-; X32-SSE41-NEXT: shrl $2, %ecx
-; X32-SSE41-NEXT: pinsrd $2, %ecx, %xmm1
-; X32-SSE41-NEXT: shrl $3, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
-; X32-SSE41-NEXT: pand {{\.LCPI.*}}, %xmm1
-; X32-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; X32-SSE41-NEXT: psllq $63, %xmm0
-; X32-SSE41-NEXT: psrad $31, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: psllq $63, %xmm1
-; X32-SSE41-NEXT: psrad $31, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <4 x i1>, <4 x i1>* %ptr
- %Y = sext <4 x i1> %X to <4 x i64>
- ret <4 x i64> %Y
-}
-
-define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
-; SSE2-LABEL: load_sext_4i8_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_4i8_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_4i8_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
-; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_4i8_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbq (%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_4i8_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_4i8_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_4i8_to_4i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $24, %xmm1
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
-; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <4 x i8>, <4 x i8>* %ptr
- %Y = sext <4 x i8> %X to <4 x i64>
- ret <4 x i64> %Y
-}
-
-define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
-; SSE2-LABEL: load_sext_4i8_to_4i64_extract:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_4i8_to_4i64_extract:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_4i8_to_4i64_extract:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_4i8_to_4i64_extract:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_4i8_to_4i64_extract:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_4i8_to_4i64_extract:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq (%rdi), %ymm0
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_4i8_to_4i64_extract:
-; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
-; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm0
-; X32-SSE41-NEXT: retl
- %ld = load <4 x i8>, <4 x i8>* %ptr
- %sext = sext <4 x i8> %ld to <4 x i64>
- %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
- ret <2 x i64> %extract
-}
-
-define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
-; SSE-LABEL: load_sext_8i1_to_8i16:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
-; SSE-NEXT: pand %xmm1, %xmm0
-; SSE-NEXT: pcmpeqw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: load_sext_8i1_to_8i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_8i1_to_8i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_sext_8i1_to_8i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: kmovw (%rdi), %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: load_sext_8i1_to_8i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: kmovw (%rdi), %k0
-; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; X32-SSE-LABEL: load_sext_8i1_to_8i16:
-; X32-SSE: # %bb.0: # %entry
-; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
-entry:
- %X = load <8 x i1>, <8 x i1>* %ptr
- %Y = sext <8 x i1> %X to <8 x i16>
- ret <8 x i16> %Y
-}
-
-define <8 x i16> @load_sext_8i8_to_8i16(<8 x i8> *%ptr) {
-; SSE2-LABEL: load_sext_8i8_to_8i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_8i8_to_8i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_8i8_to_8i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_sext_8i8_to_8i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxbw (%rdi), %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_8i8_to_8i16:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: psraw $8, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_8i8_to_8i16:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <8 x i8>, <8 x i8>* %ptr
- %Y = sext <8 x i8> %X to <8 x i16>
- ret <8 x i16> %Y
-}
-
-define <8 x i64> @load_sext_8i8_to_8i64(<8 x i8> *%ptr) {
-; SSE2-LABEL: load_sext_8i8_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: psrad $24, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_8i8_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: psrad $24, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_8i8_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
-; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
-; SSE41-NEXT: pmovsxbq 4(%rdi), %xmm2
-; SSE41-NEXT: pmovsxbq 6(%rdi), %xmm3
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_8i8_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbq 6(%rdi), %xmm1
-; AVX1-NEXT: vpmovsxbq 4(%rdi), %xmm2
-; AVX1-NEXT: vpmovsxbq 2(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbq (%rdi), %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_8i8_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbq (%rdi), %ymm0
-; AVX2-NEXT: vpmovsxbq 4(%rdi), %ymm1
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_8i8_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbq (%rdi), %zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_8i8_to_8i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X32-SSE2-NEXT: psrad $24, %xmm1
-; X32-SSE2-NEXT: pxor %xmm4, %xmm4
-; X32-SSE2-NEXT: pxor %xmm3, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X32-SSE2-NEXT: psrad $24, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; X32-SSE2-NEXT: movdqa %xmm3, %xmm2
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_8i8_to_8i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
-; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
-; X32-SSE41-NEXT: pmovsxbq 4(%eax), %xmm2
-; X32-SSE41-NEXT: pmovsxbq 6(%eax), %xmm3
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <8 x i8>, <8 x i8>* %ptr
- %Y = sext <8 x i8> %X to <8 x i64>
- ret <8 x i64> %Y
-}
-
-define <8 x i32> @load_sext_8i1_to_8i32(<8 x i1> *%ptr) {
-; SSE-LABEL: load_sext_8i1_to_8i32:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pcmpeqd %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: load_sext_8i1_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_8i1_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastd (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_8i1_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: kmovw (%rdi), %k1
-; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE-LABEL: load_sext_8i1_to_8i32:
-; X32-SSE: # %bb.0: # %entry
-; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,32,64,128]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm1
-; X32-SSE-NEXT: retl
-entry:
- %X = load <8 x i1>, <8 x i1>* %ptr
- %Y = sext <8 x i1> %X to <8 x i32>
- ret <8 x i32> %Y
-}
-
-define <8 x i32> @load_sext_8i8_to_8i32(<8 x i8> *%ptr) {
-; SSE2-LABEL: load_sext_8i8_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_8i8_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_8i8_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbd (%rdi), %xmm0
-; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_8i8_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbd 4(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbd (%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_8i8_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbd (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_8i8_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbd (%rdi), %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_8i8_to_8i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: psrad $24, %xmm1
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_8i8_to_8i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovsxbd 4(%eax), %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <8 x i8>, <8 x i8>* %ptr
- %Y = sext <8 x i8> %X to <8 x i32>
- ret <8 x i32> %Y
-}
-
-define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
-; SSE2-LABEL: load_sext_16i1_to_16i8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_16i1_to_16i8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_16i1_to_16i8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqb %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_16i1_to_16i8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7939930131212661E-307,-1.7939930131212661E-307]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_16i1_to_16i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9241421688590303745,9241421688590303745]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_sext_16i1_to_16i8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: kmovw (%rdi), %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: load_sext_16i1_to_16i8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: kmovw (%rdi), %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_16i1_to_16i8:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,1,1,4,5,6,7]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X32-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; X32-SSE2-NEXT: pand %xmm1, %xmm0
-; X32-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
-; X32-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; X32-SSE41-NEXT: pand %xmm1, %xmm0
-; X32-SSE41-NEXT: pcmpeqb %xmm1, %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <16 x i1>, <16 x i1>* %ptr
- %Y = sext <16 x i1> %X to <16 x i8>
- ret <16 x i8> %Y
-}
-
-define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
-; SSE-LABEL: load_sext_16i1_to_16i16:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pcmpeqw %xmm2, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pcmpeqw %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: load_sext_16i1_to_16i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_16i1_to_16i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpbroadcastw (%rdi), %ymm0
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_sext_16i1_to_16i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: kmovw (%rdi), %k1
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: load_sext_16i1_to_16i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: kmovw (%rdi), %k0
-; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512BW-NEXT: retq
-;
-; X32-SSE-LABEL: load_sext_16i1_to_16i16:
-; X32-SSE: # %bb.0: # %entry
-; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pcmpeqw %xmm2, %xmm1
-; X32-SSE-NEXT: retl
-entry:
- %X = load <16 x i1>, <16 x i1>* %ptr
- %Y = sext <16 x i1> %X to <16 x i16>
- ret <16 x i16> %Y
-}
-
-define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
-; SSE-LABEL: load_sext_32i1_to_32i8:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pcmpeqb %xmm2, %xmm0
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pcmpeqb %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: load_sext_32i1_to_32i8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,1,1,4,5,6,7]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,3,3,4,5,6,7]
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_32i1_to_32i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9241421688590303745,9241421688590303745,9241421688590303745,9241421688590303745]
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: load_sext_32i1_to_32i8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: kmovw (%rdi), %k1
-; AVX512F-NEXT: kmovw 2(%rdi), %k2
-; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: load_sext_32i1_to_32i8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: kmovd (%rdi), %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512BW-NEXT: retq
-;
-; X32-SSE-LABEL: load_sext_32i1_to_32i8:
-; X32-SSE: # %bb.0: # %entry
-; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,1,1,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm0
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,3,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; X32-SSE-NEXT: pand %xmm2, %xmm1
-; X32-SSE-NEXT: pcmpeqb %xmm2, %xmm1
-; X32-SSE-NEXT: retl
-entry:
- %X = load <32 x i1>, <32 x i1>* %ptr
- %Y = sext <32 x i1> %X to <32 x i8>
- ret <32 x i8> %Y
-}
-
-define <16 x i16> @load_sext_16i8_to_16i16(<16 x i8> *%ptr) {
-; SSE2-LABEL: load_sext_16i8_to_16i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: psraw $8, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_16i8_to_16i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: psraw $8, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_16i8_to_16i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
-; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_16i8_to_16i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbw (%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_16i8_to_16i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxbw (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_16i8_to_16i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxbw (%rdi), %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_16i8_to_16i16:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movdqa (%eax), %xmm1
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-SSE2-NEXT: psraw $8, %xmm0
-; X32-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE2-NEXT: psraw $8, %xmm1
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_16i8_to_16i16:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
-; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <16 x i8>, <16 x i8>* %ptr
- %Y = sext <16 x i8> %X to <16 x i16>
- ret <16 x i16> %Y
-}
-
-define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
-; SSE2-LABEL: load_sext_2i16_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_2i16_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_2i16_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_sext_2i16_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxwq (%rdi), %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_2i16_to_2i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; X32-SSE2-NEXT: psrad $16, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_2i16_to_2i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <2 x i16>, <2 x i16>* %ptr
- %Y = sext <2 x i16> %X to <2 x i64>
- ret <2 x i64> %Y
-}
-
-define <4 x i32> @load_sext_4i16_to_4i32(<4 x i16> *%ptr) {
-; SSE2-LABEL: load_sext_4i16_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_4i16_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_4i16_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_sext_4i16_to_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxwd (%rdi), %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_4i16_to_4i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: psrad $16, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_4i16_to_4i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <4 x i16>, <4 x i16>* %ptr
- %Y = sext <4 x i16> %X to <4 x i32>
- ret <4 x i32> %Y
-}
-
-define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
-; SSE2-LABEL: load_sext_4i16_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_4i16_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_4i16_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
-; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_4i16_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxwq 4(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxwq (%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_4i16_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxwq (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_4i16_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxwq (%rdi), %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_4i16_to_4i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $16, %xmm1
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
-; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <4 x i16>, <4 x i16>* %ptr
- %Y = sext <4 x i16> %X to <4 x i64>
- ret <4 x i64> %Y
-}
-
-define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
-; SSE2-LABEL: load_sext_8i16_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_8i16_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_8i16_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxwd (%rdi), %xmm0
-; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_8i16_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxwd 8(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxwd (%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_8i16_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxwd (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_8i16_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxwd (%rdi), %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_8i16_to_8i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movdqa (%eax), %xmm1
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X32-SSE2-NEXT: psrad $16, %xmm0
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: psrad $16, %xmm1
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_8i16_to_8i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxwd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovsxwd 8(%eax), %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <8 x i16>, <8 x i16>* %ptr
- %Y = sext <8 x i16> %X to <8 x i32>
- ret <8 x i32> %Y
-}
-
-define <2 x i64> @load_sext_2i32_to_2i64(<2 x i32> *%ptr) {
-; SSE2-LABEL: load_sext_2i32_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_2i32_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_2i32_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_sext_2i32_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxdq (%rdi), %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_2i32_to_2i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_2i32_to_2i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <2 x i32>, <2 x i32>* %ptr
- %Y = sext <2 x i32> %X to <2 x i64>
- ret <2 x i64> %Y
-}
-
-define <4 x i64> @load_sext_4i32_to_4i64(<4 x i32> *%ptr) {
-; SSE2-LABEL: load_sext_4i32_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_sext_4i32_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_sext_4i32_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxdq (%rdi), %xmm0
-; SSE41-NEXT: pmovsxdq 8(%rdi), %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_sext_4i32_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovsxdq 8(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxdq (%rdi), %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_sext_4i32_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovsxdq (%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_sext_4i32_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovsxdq (%rdi), %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: load_sext_4i32_to_4i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movdqa (%eax), %xmm0
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pxor %xmm3, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: load_sext_4i32_to_4i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: pmovsxdq (%eax), %xmm0
-; X32-SSE41-NEXT: pmovsxdq 8(%eax), %xmm1
-; X32-SSE41-NEXT: retl
-entry:
- %X = load <4 x i32>, <4 x i32>* %ptr
- %Y = sext <4 x i32> %X to <4 x i64>
- ret <4 x i64> %Y
-}
-
-define i32 @sext_2i8_to_i32(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_2i8_to_i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: movd %xmm0, %eax
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_2i8_to_i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: movd %xmm0, %eax
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_2i8_to_i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, %eax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_2i8_to_i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_2i8_to_i32:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: psraw $8, %xmm0
-; X32-SSE2-NEXT: movd %xmm0, %eax
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_2i8_to_i32:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: pmovsxbw %xmm0, %xmm0
-; X32-SSE41-NEXT: movd %xmm0, %eax
-; X32-SSE41-NEXT: retl
-entry:
- %Shuf = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
- %Ex = sext <2 x i8> %Shuf to <2 x i16>
- %Bc = bitcast <2 x i16> %Ex to i32
- ret i32 %Bc
-}
-
-define <4 x i64> @sext_4i1_to_4i64(<4 x i1> %mask) {
-; SSE2-LABEL: sext_4i1_to_4i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $31, %xmm0
-; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_4i1_to_4i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pslld $31, %xmm0
-; SSSE3-NEXT: psrad $31, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pxor %xmm3, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_4i1_to_4i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $31, %xmm0
-; SSE41-NEXT: psrad $31, %xmm0
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovsxdq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_4i1_to_4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxdq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_4i1_to_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_4i1_to_4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX512-NEXT: vpsrad $31, %xmm0, %xmm0
-; AVX512-NEXT: vpmovsxdq %xmm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_4i1_to_4i64:
-; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pslld $31, %xmm0
-; X32-SSE2-NEXT: psrad $31, %xmm0
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pxor %xmm3, %xmm3
-; X32-SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_4i1_to_4i64:
-; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pslld $31, %xmm0
-; X32-SSE41-NEXT: psrad $31, %xmm0
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxdq %xmm0, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE41-NEXT: retl
- %extmask = sext <4 x i1> %mask to <4 x i64>
- ret <4 x i64> %extmask
-}
-
-define <4 x i64> @sext_4i8_to_4i64(<4 x i8> %mask) {
-; SSE2-LABEL: sext_4i8_to_4i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: psrad $24, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_4i8_to_4i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: psrad $24, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_4i8_to_4i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm2
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmovsxbq %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_4i8_to_4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbq %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_4i8_to_4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbq %xmm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_4i8_to_4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq %xmm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_4i8_to_4i64:
-; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE2-NEXT: psrad $24, %xmm1
-; X32-SSE2-NEXT: pxor %xmm2, %xmm2
-; X32-SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_4i8_to_4i64:
-; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm2
-; X32-SSE41-NEXT: psrld $16, %xmm0
-; X32-SSE41-NEXT: pmovsxbq %xmm0, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm0
-; X32-SSE41-NEXT: retl
- %extmask = sext <4 x i8> %mask to <4 x i64>
- ret <4 x i64> %extmask
-}
-
-define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
-; SSE-LABEL: sext_32xi1_to_32xi8:
-; SSE: # %bb.0:
-; SSE-NEXT: pcmpeqw %xmm5, %xmm1
-; SSE-NEXT: pcmpeqw %xmm4, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: pcmpeqw %xmm7, %xmm3
-; SSE-NEXT: pcmpeqw %xmm6, %xmm2
-; SSE-NEXT: packsswb %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: sext_32xi1_to_32xi8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpcmpeqw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpcmpeqw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_32xi1_to_32xi8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: sext_32xi1_to_32xi8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpcmpeqw %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: sext_32xi1_to_32xi8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpcmpeqw %zmm1, %zmm0, %k0
-; AVX512BW-NEXT: vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512BW-NEXT: retq
-;
-; X32-SSE-LABEL: sext_32xi1_to_32xi8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pushl %ebp
-; X32-SSE-NEXT: movl %esp, %ebp
-; X32-SSE-NEXT: andl $-16, %esp
-; X32-SSE-NEXT: subl $16, %esp
-; X32-SSE-NEXT: movdqa 8(%ebp), %xmm3
-; X32-SSE-NEXT: pcmpeqw 40(%ebp), %xmm1
-; X32-SSE-NEXT: pcmpeqw 24(%ebp), %xmm0
-; X32-SSE-NEXT: packsswb %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqw 72(%ebp), %xmm3
-; X32-SSE-NEXT: pcmpeqw 56(%ebp), %xmm2
-; X32-SSE-NEXT: packsswb %xmm3, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: movl %ebp, %esp
-; X32-SSE-NEXT: popl %ebp
-; X32-SSE-NEXT: retl
- %a = icmp eq <32 x i16> %c1, %c2
- %b = sext <32 x i1> %a to <32 x i8>
- ret <32 x i8> %b
-}
-
-define <2 x i32> @sext_2i8_to_2i32(<2 x i8>* %addr) {
-; SSE2-LABEL: sext_2i8_to_2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movzwl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_2i8_to_2i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movzwl (%rdi), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSSE3-NEXT: psrad $24, %xmm0
-; SSSE3-NEXT: paddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_2i8_to_2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl (%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; SSE41-NEXT: paddd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_2i8_to_2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_2i8_to_2i32:
-; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movzwl (%eax), %eax
-; X32-SSE2-NEXT: movd %eax, %xmm0
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; X32-SSE2-NEXT: psrad $24, %xmm0
-; X32-SSE2-NEXT: paddd %xmm0, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_2i8_to_2i32:
-; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movzwl (%eax), %eax
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: pmovsxbd %xmm0, %xmm0
-; X32-SSE41-NEXT: paddd %xmm0, %xmm0
-; X32-SSE41-NEXT: retl
- %x = load <2 x i8>, <2 x i8>* %addr, align 1
- %y = sext <2 x i8> %x to <2 x i32>
- %z = add <2 x i32>%y, %y
- ret <2 x i32>%z
-}
-
-define <4 x i32> @sext_4i17_to_4i32(<4 x i17>* %ptr) {
-; SSE2-LABEL: sext_4i17_to_4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $30, %rcx
-; SSE2-NEXT: sarq $47, %rcx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shlq $47, %rcx
-; SSE2-NEXT: sarq $47, %rcx
-; SSE2-NEXT: movd %ecx, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movl 8(%rdi), %ecx
-; SSE2-NEXT: shll $13, %ecx
-; SSE2-NEXT: movq %rax, %rdx
-; SSE2-NEXT: shrq $51, %rdx
-; SSE2-NEXT: orl %ecx, %edx
-; SSE2-NEXT: shlq $47, %rdx
-; SSE2-NEXT: sarq $47, %rdx
-; SSE2-NEXT: movd %edx, %xmm1
-; SSE2-NEXT: shlq $13, %rax
-; SSE2-NEXT: sarq $47, %rax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_4i17_to_4i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq (%rdi), %rax
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $30, %rcx
-; SSSE3-NEXT: sarq $47, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shlq $47, %rcx
-; SSSE3-NEXT: sarq $47, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movl 8(%rdi), %ecx
-; SSSE3-NEXT: shll $13, %ecx
-; SSSE3-NEXT: movq %rax, %rdx
-; SSSE3-NEXT: shrq $51, %rdx
-; SSSE3-NEXT: orl %ecx, %edx
-; SSSE3-NEXT: shlq $47, %rdx
-; SSSE3-NEXT: sarq $47, %rdx
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: shlq $13, %rax
-; SSSE3-NEXT: sarq $47, %rax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_4i17_to_4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movq (%rdi), %rax
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shlq $30, %rcx
-; SSE41-NEXT: sarq $47, %rcx
-; SSE41-NEXT: movq %rax, %rdx
-; SSE41-NEXT: shlq $47, %rdx
-; SSE41-NEXT: sarq $47, %rdx
-; SSE41-NEXT: movd %edx, %xmm0
-; SSE41-NEXT: pinsrd $1, %ecx, %xmm0
-; SSE41-NEXT: movq %rax, %rcx
-; SSE41-NEXT: shlq $13, %rcx
-; SSE41-NEXT: sarq $47, %rcx
-; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
-; SSE41-NEXT: movl 8(%rdi), %ecx
-; SSE41-NEXT: shll $13, %ecx
-; SSE41-NEXT: shrq $51, %rax
-; SSE41-NEXT: orl %ecx, %eax
-; SSE41-NEXT: shlq $47, %rax
-; SSE41-NEXT: sarq $47, %rax
-; SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: sext_4i17_to_4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $30, %rcx
-; AVX-NEXT: sarq $47, %rcx
-; AVX-NEXT: movq %rax, %rdx
-; AVX-NEXT: shlq $47, %rdx
-; AVX-NEXT: sarq $47, %rdx
-; AVX-NEXT: vmovd %edx, %xmm0
-; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movq %rax, %rcx
-; AVX-NEXT: shlq $13, %rcx
-; AVX-NEXT: sarq $47, %rcx
-; AVX-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX-NEXT: movl 8(%rdi), %ecx
-; AVX-NEXT: shll $13, %ecx
-; AVX-NEXT: shrq $51, %rax
-; AVX-NEXT: orl %ecx, %eax
-; AVX-NEXT: shlq $47, %rax
-; AVX-NEXT: sarq $47, %rax
-; AVX-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_4i17_to_4i32:
-; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE2-NEXT: movl (%eax), %ecx
-; X32-SSE2-NEXT: movl 4(%eax), %edx
-; X32-SSE2-NEXT: movl 8(%eax), %eax
-; X32-SSE2-NEXT: shldl $13, %edx, %eax
-; X32-SSE2-NEXT: shll $15, %eax
-; X32-SSE2-NEXT: movd %eax, %xmm0
-; X32-SSE2-NEXT: movl %edx, %eax
-; X32-SSE2-NEXT: shll $13, %eax
-; X32-SSE2-NEXT: movd %eax, %xmm1
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE2-NEXT: shldl $15, %ecx, %edx
-; X32-SSE2-NEXT: shll $15, %ecx
-; X32-SSE2-NEXT: movd %ecx, %xmm0
-; X32-SSE2-NEXT: shll $15, %edx
-; X32-SSE2-NEXT: movd %edx, %xmm2
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; X32-SSE2-NEXT: psrad $15, %xmm0
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_4i17_to_4i32:
-; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pushl %esi
-; X32-SSE41-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE41-NEXT: .cfi_offset %esi, -8
-; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movl (%eax), %ecx
-; X32-SSE41-NEXT: movl 4(%eax), %edx
-; X32-SSE41-NEXT: movl %edx, %esi
-; X32-SSE41-NEXT: movl 8(%eax), %eax
-; X32-SSE41-NEXT: shldl $13, %edx, %eax
-; X32-SSE41-NEXT: shldl $15, %ecx, %edx
-; X32-SSE41-NEXT: shll $15, %edx
-; X32-SSE41-NEXT: shll $15, %ecx
-; X32-SSE41-NEXT: movd %ecx, %xmm0
-; X32-SSE41-NEXT: pinsrd $1, %edx, %xmm0
-; X32-SSE41-NEXT: shll $13, %esi
-; X32-SSE41-NEXT: pinsrd $2, %esi, %xmm0
-; X32-SSE41-NEXT: shll $15, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: psrad $15, %xmm0
-; X32-SSE41-NEXT: popl %esi
-; X32-SSE41-NEXT: .cfi_def_cfa_offset 4
-; X32-SSE41-NEXT: retl
- %a = load <4 x i17>, <4 x i17>* %ptr
- %b = sext <4 x i17> %a to <4 x i32>
- ret <4 x i32> %b
-}
-
-define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
-; SSE2-LABEL: sext_8i6_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE2-NEXT: paddw {{.*}}(%rip), %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: psllq $58, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: psrad $26, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: psllq $58, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psrad $31, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: psrad $26, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: psllq $58, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-NEXT: psrad $26, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: psllq $58, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: psrad $31, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSE2-NEXT: psrad $26, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: sext_8i6_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT: psllq $58, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: psrad $31, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3-NEXT: psrad $26, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT: psllq $58, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: psrad $31, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSSE3-NEXT: psrad $26, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT: psllq $58, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSSE3-NEXT: psrad $26, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT: psllq $58, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: psrad $31, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; SSSE3-NEXT: psrad $26, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: sext_8i6_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE41-NEXT: paddw {{.*}}(%rip), %xmm3
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE41-NEXT: psllq $58, %xmm0
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: psrad $26, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: psllq $58, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: psrad $26, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE41-NEXT: psllq $58, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm4
-; SSE41-NEXT: psrad $31, %xmm4
-; SSE41-NEXT: psrad $26, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE41-NEXT: psllq $58, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psrad $31, %xmm4
-; SSE41-NEXT: psrad $26, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: sext_8i6_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsllw $10, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $10, %xmm0, %xmm1
-; AVX1-NEXT: vpmovsxwq %xmm1, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovsxwq %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
-; AVX1-NEXT: vpmovsxwq %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: sext_8i6_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpsllw $10, %xmm0, %xmm0
-; AVX2-NEXT: vpsraw $10, %xmm0, %xmm1
-; AVX2-NEXT: vpmovsxwq %xmm1, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX2-NEXT: vpmovsxwq %xmm1, %ymm1
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: sext_8i6_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovd %edi, %xmm0
-; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vpsllq $58, %zmm0, %zmm0
-; AVX512-NEXT: vpsraq $58, %zmm0, %zmm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: sext_8i6_to_8i64:
-; X32-SSE2: # %bb.0: # %entry
-; X32-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; X32-SSE2-NEXT: paddw {{\.LCPI.*}}, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
-; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; X32-SSE2-NEXT: psllq $58, %xmm0
-; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE2-NEXT: psrad $31, %xmm1
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE2-NEXT: psrad $26, %xmm0
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
-; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; X32-SSE2-NEXT: psllq $58, %xmm1
-; X32-SSE2-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE2-NEXT: psrad $31, %xmm2
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE2-NEXT: psrad $26, %xmm1
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
-; X32-SSE2-NEXT: psllq $58, %xmm2
-; X32-SSE2-NEXT: movdqa %xmm2, %xmm4
-; X32-SSE2-NEXT: psrad $31, %xmm4
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; X32-SSE2-NEXT: psrad $26, %xmm2
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
-; X32-SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
-; X32-SSE2-NEXT: psllq $58, %xmm3
-; X32-SSE2-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE2-NEXT: psrad $31, %xmm4
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
-; X32-SSE2-NEXT: psrad $26, %xmm3
-; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
-; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: sext_8i6_to_8i64:
-; X32-SSE41: # %bb.0: # %entry
-; X32-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; X32-SSE41-NEXT: paddw {{\.LCPI.*}}, %xmm3
-; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; X32-SSE41-NEXT: psllq $58, %xmm0
-; X32-SSE41-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE41-NEXT: psrad $31, %xmm1
-; X32-SSE41-NEXT: psrad $26, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; X32-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
-; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X32-SSE41-NEXT: psllq $58, %xmm1
-; X32-SSE41-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE41-NEXT: psrad $31, %xmm2
-; X32-SSE41-NEXT: psrad $26, %xmm1
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; X32-SSE41-NEXT: psllq $58, %xmm2
-; X32-SSE41-NEXT: movdqa %xmm2, %xmm4
-; X32-SSE41-NEXT: psrad $31, %xmm4
-; X32-SSE41-NEXT: psrad $26, %xmm2
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; X32-SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; X32-SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; X32-SSE41-NEXT: psllq $58, %xmm3
-; X32-SSE41-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE41-NEXT: psrad $31, %xmm4
-; X32-SSE41-NEXT: psrad $26, %xmm3
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; X32-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
-; X32-SSE41-NEXT: retl
-entry:
- %a = trunc i32 %x to i6
- %b = insertelement <8 x i6> undef, i6 %a, i32 0
- %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer
- %d = add <8 x i6> %c, <i6 0, i6 1, i6 2, i6 3, i6 4, i6 5, i6 6, i6 7>
- %e = sext <8 x i6> %d to <8 x i64>
- ret <8 x i64> %e
-}
-
-define <8 x i32> @zext_negate_sext(<8 x i8> %x) {
-; SSE2-LABEL: zext_negate_sext:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: psubw %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_negate_sext:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: psubw %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_negate_sext:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: psubw %xmm0, %xmm1
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_negate_sext:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_negate_sext:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpsubd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_negate_sext:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpsubd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: zext_negate_sext:
-; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-SSE2-NEXT: psubw %xmm0, %xmm1
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X32-SSE2-NEXT: psrad $16, %xmm0
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: psrad $16, %xmm1
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: zext_negate_sext:
-; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X32-SSE41-NEXT: pxor %xmm1, %xmm1
-; X32-SSE41-NEXT: psubw %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1
-; X32-SSE41-NEXT: retl
- %z = zext <8 x i8> %x to <8 x i16>
- %neg = sub nsw <8 x i16> zeroinitializer, %z
- %r = sext <8 x i16> %neg to <8 x i32>
- ret <8 x i32> %r
-}
-
-define <8 x i32> @zext_decremenet_sext(<8 x i8> %x) {
-; SSE2-LABEL: zext_decremenet_sext:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE2-NEXT: paddw %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_decremenet_sext:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1
-; SSSE3-NEXT: paddw %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_decremenet_sext:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; SSE41-NEXT: paddw %xmm0, %xmm1
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovsxwd %xmm1, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_decremenet_sext:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_decremenet_sext:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_decremenet_sext:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: retq
-;
-; X32-SSE2-LABEL: zext_decremenet_sext:
-; X32-SSE2: # %bb.0:
-; X32-SSE2-NEXT: pxor %xmm1, %xmm1
-; X32-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-SSE2-NEXT: pcmpeqd %xmm1, %xmm1
-; X32-SSE2-NEXT: paddw %xmm0, %xmm1
-; X32-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; X32-SSE2-NEXT: psrad $16, %xmm0
-; X32-SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; X32-SSE2-NEXT: psrad $16, %xmm1
-; X32-SSE2-NEXT: retl
-;
-; X32-SSE41-LABEL: zext_decremenet_sext:
-; X32-SSE41: # %bb.0:
-; X32-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; X32-SSE41-NEXT: pcmpeqd %xmm1, %xmm1
-; X32-SSE41-NEXT: paddw %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE41-NEXT: pmovsxwd %xmm1, %xmm1
-; X32-SSE41-NEXT: retl
- %z = zext <8 x i8> %x to <8 x i16>
- %dec = add <8 x i16> %z, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
- %r = sext <8 x i16> %dec to <8 x i32>
- ret <8 x i32> %r
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
-;
-; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
-
-;
-; Variable Shifts
-;
-
-define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrad %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrad %xmm4, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrad %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrad %xmm1, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrad %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrad %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrad %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshad %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsravd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrad %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrad %xmm4, %xmm2
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrad %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrad %xmm1, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psraw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: var_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v4i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v4i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $8, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: psraw $15, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: psraw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <4 x i16> %a, %b
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psraw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsraw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: var_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshaw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQ-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v2i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQVL-NEXT: vpsravd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsravw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $8, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psraw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: psraw $15, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: psraw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i16> %a, %b
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $4, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $2, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm6
-; X32-SSE-NEXT: pandn %xmm2, %xmm6
-; X32-SSE-NEXT: psraw $4, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm6, %xmm2
-; X32-SSE-NEXT: paddw %xmm4, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm6
-; X32-SSE-NEXT: pandn %xmm2, %xmm6
-; X32-SSE-NEXT: psraw $2, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm6, %xmm2
-; X32-SSE-NEXT: paddw %xmm4, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm4
-; X32-SSE-NEXT: pandn %xmm2, %xmm4
-; X32-SSE-NEXT: psraw $1, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm4, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm0, %xmm5
-; X32-SSE-NEXT: psraw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm4, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm0, %xmm5
-; X32-SSE-NEXT: psraw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm4, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: psraw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <8 x i8> %a, %b
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $4, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $2, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm6
-; X32-SSE-NEXT: pandn %xmm2, %xmm6
-; X32-SSE-NEXT: psraw $4, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm6, %xmm2
-; X32-SSE-NEXT: paddw %xmm4, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm6
-; X32-SSE-NEXT: pandn %xmm2, %xmm6
-; X32-SSE-NEXT: psraw $2, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm6, %xmm2
-; X32-SSE-NEXT: paddw %xmm4, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm4
-; X32-SSE-NEXT: pandn %xmm2, %xmm4
-; X32-SSE-NEXT: psraw $1, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm4, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm0, %xmm5
-; X32-SSE-NEXT: psraw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm4, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm0, %xmm5
-; X32-SSE-NEXT: psraw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm4, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: psraw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <4 x i8> %a, %b
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $4, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: psraw $2, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: paddw %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pcmpgtw %xmm4, %xmm5
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pandn %xmm2, %xmm4
-; SSE2-NEXT: psraw $1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: por %xmm4, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $4, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pandn %xmm0, %xmm5
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtw %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psraw $1, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $4, %xmm4
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $2, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm4
-; SSE41-NEXT: psraw $1, %xmm4
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm3
-; SSE41-NEXT: psrlw $8, %xmm3
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE41-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $4, %xmm2
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $2, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psraw $1, %xmm2
-; SSE41-NEXT: paddw %xmm0, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX-NEXT: vpsraw $4, %xmm3, %xmm4
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $2, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
-; AVX-NEXT: vpsraw $1, %xmm3, %xmm4
-; AVX-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
-; AVX-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX-NEXT: vpsraw $4, %xmm0, %xmm3
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm3
-; AVX-NEXT: vpaddw %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm6
-; X32-SSE-NEXT: pandn %xmm2, %xmm6
-; X32-SSE-NEXT: psraw $4, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm6, %xmm2
-; X32-SSE-NEXT: paddw %xmm4, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm6
-; X32-SSE-NEXT: pandn %xmm2, %xmm6
-; X32-SSE-NEXT: psraw $2, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm6, %xmm2
-; X32-SSE-NEXT: paddw %xmm4, %xmm4
-; X32-SSE-NEXT: pxor %xmm5, %xmm5
-; X32-SSE-NEXT: pcmpgtw %xmm4, %xmm5
-; X32-SSE-NEXT: movdqa %xmm5, %xmm4
-; X32-SSE-NEXT: pandn %xmm2, %xmm4
-; X32-SSE-NEXT: psraw $1, %xmm2
-; X32-SSE-NEXT: pand %xmm5, %xmm2
-; X32-SSE-NEXT: por %xmm4, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm0, %xmm5
-; X32-SSE-NEXT: psraw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm4, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm4, %xmm4
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm4
-; X32-SSE-NEXT: movdqa %xmm4, %xmm5
-; X32-SSE-NEXT: pandn %xmm0, %xmm5
-; X32-SSE-NEXT: psraw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm4, %xmm0
-; X32-SSE-NEXT: por %xmm5, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtw %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: psraw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i8> %a, %b
- ret <2 x i8> %shift
-}
-
-;
-; Uniform Variable Shifts
-;
-
-define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE2-NEXT: psrad %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: psrad %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; XOP-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX512-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX512VL-NEXT: vpsrad %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: xorps %xmm2, %xmm2
-; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-SSE-NEXT: psrad %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
- %shift = ashr <2 x i32> %a, %splat
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psraw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: psraw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psraw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
- %shift = ashr <4 x i16> %a, %splat
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psraw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: psraw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; XOP-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsraw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psraw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
- %shift = ashr <2 x i16> %a, %splat
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: psubb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: psubb %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v8i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: splatvar_shift_v8i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: pxor %xmm2, %xmm0
-; X32-SSE-NEXT: psubb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
- %shift = ashr <8 x i8> %a, %splat
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: psubb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: psubb %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: splatvar_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshab %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: pxor %xmm2, %xmm0
-; X32-SSE-NEXT: psubb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
- %shift = ashr <4 x i8> %a, %splat
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: psubb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: psubb %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX2-NEXT: vpbroadcastb %xmm2, %xmm2
-; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshab %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsravd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: pxor %xmm2, %xmm0
-; X32-SSE-NEXT: psubb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
- %shift = ashr <2 x i8> %a, %splat
- ret <2 x i8> %shift
-}
-
-;
-; Constant Shifts
-;
-
-define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrad $4, %xmm1
-; SSE2-NEXT: psrad $5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrad $5, %xmm1
-; SSE41-NEXT: psrad $4, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrad $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsrad $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: constant_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshad {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: constant_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: constant_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: constant_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrad $4, %xmm1
-; X32-SSE-NEXT: psrad $5, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i32> %a, <i32 4, i32 5>
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE2-LABEL: constant_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psraw $2, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE2-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: andps %xmm2, %xmm0
-; SSE2-NEXT: psraw $1, %xmm1
-; SSE2-NEXT: andnps %xmm1, %xmm2
-; SSE2-NEXT: orps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
-; SSE41-NEXT: pmulhw %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE41-NEXT: psraw $1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: constant_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulhw {{.*}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: vpsraw $1, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
-; AVX-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v4i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v4i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovsxwd %xmm0, %ymm0
-; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psraw $2, %xmm1
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; X32-SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,65535,65535,65535]
-; X32-SSE-NEXT: movaps %xmm1, %xmm0
-; X32-SSE-NEXT: andps %xmm2, %xmm0
-; X32-SSE-NEXT: psraw $1, %xmm1
-; X32-SSE-NEXT: andnps %xmm1, %xmm2
-; X32-SSE-NEXT: orps %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psraw $3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: psraw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psraw $3, %xmm1
-; SSE41-NEXT: psraw $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: constant_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsraw $3, %xmm0, %xmm1
-; AVX-NEXT: vpsraw $2, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshaw {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsraw $3, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpsraw $2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v2i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsraw $3, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpsraw $2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psraw $3, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
-; X32-SSE-NEXT: psraw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm2
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i16> %a, <i16 2, i16 3>
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE-LABEL: constant_shift_v8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: psraw $8, %xmm0
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
-; SSE-LABEL: constant_shift_v4i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: psraw $8, %xmm0
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE-LABEL: constant_shift_v2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQ-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovsxbd %xmm0, %zmm0
-; AVX512DQVL-NEXT: vpsravd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsxbw %xmm0, %ymm0
-; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: psraw $8, %xmm0
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i8> %a, <i8 2, i8 3>
- ret <2 x i8> %shift
-}
-
-;
-; Uniform Constant Shifts
-;
-
-define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: psrad $5, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrad $5, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsrad $5, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrad $5, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrad $5, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrad $5, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i32> %a, <i32 5, i32 5>
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: psraw $3, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsraw $3, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psraw $3, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: psraw $3, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsraw $3, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsraw $3, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsraw $3, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsraw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psraw $3, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i16> %a, <i16 3, i16 3>
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: pxor %xmm1, %xmm0
-; X32-SSE-NEXT: psubb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v4i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: pxor %xmm1, %xmm0
-; X32-SSE-NEXT: psubb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; SSE-NEXT: pxor %xmm1, %xmm0
-; SSE-NEXT: psubb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshab {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: pxor %xmm1, %xmm0
-; X32-SSE-NEXT: psubb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = ashr <2 x i8> %a, <i8 3, i8 3>
- ret <2 x i8> %shift
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
-;
-; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
-
-;
-; Variable Shifts
-;
-
-define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: psrld %xmm2, %xmm3
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: psrld %xmm4, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: psrld %xmm3, %xmm4
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE2-NEXT: psrld %xmm1, %xmm0
-; SSE2-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; SSE2-NEXT: movaps %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm2, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm5
-; SSE41-NEXT: psrld %xmm4, %xmm5
-; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrld %xmm1, %xmm3
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
-; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm3
-; X32-SSE-NEXT: psrld %xmm2, %xmm3
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: psrld %xmm4, %xmm2
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
-; X32-SSE-NEXT: movdqa %xmm0, %xmm4
-; X32-SSE-NEXT: psrld %xmm3, %xmm4
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
-; X32-SSE-NEXT: psrld %xmm1, %xmm0
-; X32-SSE-NEXT: punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
-; X32-SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
-; X32-SSE-NEXT: movaps %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: var_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v4i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v4i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: psraw $15, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <4 x i16> %a, %b
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $12, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: psraw $15, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: paddw %xmm1, %xmm1
-; SSE2-NEXT: psraw $15, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: psllw $12, %xmm0
-; SSE41-NEXT: psllw $4, %xmm2
-; SSE41-NEXT: por %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddw %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm4
-; SSE41-NEXT: psrlw $8, %xmm4
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $4, %xmm2
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $2, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm2
-; SSE41-NEXT: psrlw $1, %xmm2
-; SSE41-NEXT: paddw %xmm3, %xmm3
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllw $12, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1
-; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm2
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm3
-; AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1
-; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: var_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubw %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v2i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlvw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $12, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: psraw $15, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm3
-; X32-SSE-NEXT: pandn %xmm0, %xmm3
-; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm3, %xmm0
-; X32-SSE-NEXT: paddw %xmm1, %xmm1
-; X32-SSE-NEXT: psraw $15, %xmm1
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i16> %a, %b
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <8 x i8> %a, %b
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <4 x i8> %a, %b
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: psrlw $1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psrlw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psrlw $1, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsrlw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsrlw $1, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psrlw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: psrlw $1, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i8> %a, %b
- ret <2 x i8> %shift
-}
-
-;
-; Uniform Variable Shifts
-;
-
-define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE2-NEXT: psrld %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: psrld %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; XOP-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX512-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX512VL-NEXT: vpsrld %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: xorps %xmm2, %xmm2
-; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-SSE-NEXT: psrld %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
- %shift = lshr <2 x i32> %a, %splat
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: psrlw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
- %shift = lshr <4 x i16> %a, %splat
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: psrlw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; XOP-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
- %shift = lshr <2 x i16> %a, %splat
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v8i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: splatvar_shift_v8i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
- %shift = lshr <8 x i8> %a, %splat
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: splatvar_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOPAVX2-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
- %shift = lshr <4 x i8> %a, %splat
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psrlw %xmm1, %xmm2
-; SSE2-NEXT: psrlw $8, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psrlw %xmm1, %xmm2
-; SSE41-NEXT: pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsrlw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; XOP-NEXT: vpsubb %xmm1, %xmm2, %xmm1
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psrlw %xmm1, %xmm2
-; X32-SSE-NEXT: psrlw $8, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
- %shift = lshr <2 x i8> %a, %splat
- ret <2 x i8> %shift
-}
-
-;
-; Constant Shifts
-;
-
-define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrld $4, %xmm1
-; SSE2-NEXT: psrld $5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $5, %xmm1
-; SSE41-NEXT: psrld $4, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrld $5, %xmm0, %xmm1
-; AVX1-NEXT: vpsrld $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: constant_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: constant_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: constant_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: constant_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrld $4, %xmm1
-; X32-SSE-NEXT: psrld $5, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i32> %a, <i32 4, i32 5>
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE2-LABEL: constant_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm0, %xmm2
-; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u>
-; SSE41-NEXT: pmulhuw %xmm0, %xmm1
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: constant_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v4i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v4i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmulhuw {{.*}}(%rip), %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: pandn %xmm0, %xmm2
-; X32-SSE-NEXT: pmulhuw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrlw $3, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
-; SSE2-NEXT: psrlw $2, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm1, %xmm2
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrlw $3, %xmm1
-; SSE41-NEXT: psrlw $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: constant_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm1
-; AVX-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsrlw $3, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v2i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsrlw $3, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpsrlw $2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: psrlw $3, %xmm1
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
-; X32-SSE-NEXT: psrlw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: pandn %xmm1, %xmm2
-; X32-SSE-NEXT: por %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i16> %a, <i16 2, i16 3>
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: psrlw $8, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm1
-; SSE41-NEXT: psrlw $8, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: movdqa %xmm0, %xmm2
-; X32-SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: psrlw $8, %xmm0
-; X32-SSE-NEXT: packuswb %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i8> %a, <i8 2, i8 3>
- ret <2 x i8> %shift
-}
-
-;
-; Uniform Constant Shifts
-;
-
-define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: psrld $5, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrld $5, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsrld $5, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $5, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrld $5, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrld $5, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i32> %a, <i32 5, i32 5>
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsrlw $3, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i16> %a, <i16 3, i16 3>
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v4i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psrlw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psrlw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: retl
- %shift = lshr <2 x i8> %a, <i8 3, i8 3>
- ret <2 x i8> %shift
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL
-;
-; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
-
-;
-; Variable Shifts
-;
-
-define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pmulld %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: var_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshld %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: var_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: var_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: var_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i32> %a, %b
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm3, %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm2
-; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE41-NEXT: packusdw %xmm1, %xmm2
-; SSE41-NEXT: pmullw %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v4i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v4i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: var_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v4i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v4i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-SSE-NEXT: pslld $23, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; X32-SSE-NEXT: paddd %xmm3, %xmm2
-; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd %xmm3, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X32-SSE-NEXT: pmullw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <4 x i16> %a, %b
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT: pslld $23, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; SSE2-NEXT: paddd %xmm3, %xmm2
-; SSE2-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE2-NEXT: pmullw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; SSE41-NEXT: paddd %xmm3, %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pslld $23, %xmm2
-; SSE41-NEXT: paddd %xmm3, %xmm2
-; SSE41-NEXT: cvttps2dq %xmm2, %xmm2
-; SSE41-NEXT: packusdw %xmm1, %xmm2
-; SSE41-NEXT: pmullw %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: var_shift_v2i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpslld $23, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vcvttps2dq %xmm2, %xmm2
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: var_shift_v2i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: var_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQ-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v2i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512DQVL-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512DQVL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllvw %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm1, %xmm2
-; X32-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; X32-SSE-NEXT: pslld $23, %xmm2
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
-; X32-SSE-NEXT: paddd %xmm3, %xmm2
-; X32-SSE-NEXT: cvttps2dq %xmm2, %xmm2
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; X32-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd %xmm3, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; X32-SSE-NEXT: pmullw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i16> %a, %b
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: paddb %xmm0, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <8 x i8> %a, %b
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: paddb %xmm0, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <4 x i8> %a, %b
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
-; SSE2-LABEL: var_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: psllw $5, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $4, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm4
-; SSE2-NEXT: psllw $2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: paddb %xmm1, %xmm1
-; SSE2-NEXT: pcmpgtb %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm0, %xmm1
-; SSE2-NEXT: paddb %xmm0, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: var_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: psllw $5, %xmm1
-; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $4, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: psllw $2, %xmm3
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm2, %xmm3
-; SSE41-NEXT: paddb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pblendvb %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: var_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $5, %xmm1, %xmm1
-; AVX-NEXT: vpsllw $4, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm2
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm2, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpaddb %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vpaddb %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: var_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: var_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: var_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: var_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: var_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: var_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $5, %xmm1
-; X32-SSE-NEXT: pxor %xmm2, %xmm2
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psllw $4, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pxor %xmm3, %xmm3
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa %xmm3, %xmm4
-; X32-SSE-NEXT: pandn %xmm0, %xmm4
-; X32-SSE-NEXT: psllw $2, %xmm0
-; X32-SSE-NEXT: pand %xmm3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: por %xmm4, %xmm0
-; X32-SSE-NEXT: paddb %xmm1, %xmm1
-; X32-SSE-NEXT: pcmpgtb %xmm1, %xmm2
-; X32-SSE-NEXT: movdqa %xmm2, %xmm1
-; X32-SSE-NEXT: pandn %xmm0, %xmm1
-; X32-SSE-NEXT: paddb %xmm0, %xmm0
-; X32-SSE-NEXT: pand %xmm2, %xmm0
-; X32-SSE-NEXT: por %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i8> %a, %b
- ret <2 x i8> %shift
-}
-
-;
-; Uniform Variable Shifts
-;
-
-define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; SSE2-NEXT: pslld %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: pslld %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; XOP-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX512-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
-; AVX512VL-NEXT: vpslld %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: xorps %xmm2, %xmm2
-; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
-; X32-SSE-NEXT: pslld %xmm2, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i32> %b, <2 x i32> undef, <2 x i32> zeroinitializer
- %shift = shl <2 x i32> %a, %splat
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v4i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psllw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v4i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: psllw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psllw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <4 x i16> %b, <4 x i16> undef, <4 x i32> zeroinitializer
- %shift = shl <4 x i16> %a, %splat
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psllw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: psllw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: splatvar_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; XOP-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatvar_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatvar_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX512VL-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psllw %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> zeroinitializer
- %shift = shl <2 x i16> %a, %splat
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psllw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psllw %xmm1, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psllw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psllw %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pshufb %xmm1, %xmm2
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v8i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: splatvar_shift_v8i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psllw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psllw %xmm1, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <8 x i8> %b, <8 x i8> undef, <8 x i32> zeroinitializer
- %shift = shl <8 x i8> %a, %splat
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psllw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psllw %xmm1, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psllw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psllw %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pshufb %xmm1, %xmm2
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: splatvar_shift_v4i8:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7]
-; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: splatvar_shift_v4i8:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; XOPAVX2-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psllw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psllw %xmm1, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <4 x i8> %b, <4 x i8> undef, <4 x i32> zeroinitializer
- %shift = shl <4 x i8> %a, %splat
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
-; SSE2-LABEL: splatvar_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psllw %xmm1, %xmm0
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE2-NEXT: psllw %xmm1, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: splatvar_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psllw %xmm1, %xmm0
-; SSE41-NEXT: pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT: psllw %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pshufb %xmm1, %xmm2
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatvar_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatvar_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpsllw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpsllw %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: splatvar_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
-; XOP-NEXT: vpshlb %xmm1, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: splatvar_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: splatvar_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: splatvar_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd %zmm1, %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: splatvar_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpbroadcastb %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
-; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: splatvar_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psllw %xmm1, %xmm0
-; X32-SSE-NEXT: pcmpeqd %xmm2, %xmm2
-; X32-SSE-NEXT: psllw %xmm1, %xmm2
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,2,3,4,5,6,7]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pand %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %splat = shufflevector <2 x i8> %b, <2 x i8> undef, <2 x i32> zeroinitializer
- %shift = shl <2 x i8> %a, %splat
- ret <2 x i8> %shift
-}
-
-;
-; Constant Shifts
-;
-
-define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $4, %xmm1
-; SSE2-NEXT: pslld $5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pslld $5, %xmm1
-; SSE41-NEXT: pslld $4, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v2i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpslld $5, %xmm0, %xmm1
-; AVX1-NEXT: vpslld $4, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v2i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; XOPAVX1-LABEL: constant_shift_v2i32:
-; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpshld {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT: retq
-;
-; XOPAVX2-LABEL: constant_shift_v2i32:
-; XOPAVX2: # %bb.0:
-; XOPAVX2-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT: retq
-;
-; AVX512-LABEL: constant_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: constant_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: movdqa %xmm0, %xmm1
-; X32-SSE-NEXT: pslld $4, %xmm1
-; X32-SSE-NEXT: pslld $5, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X32-SSE-NEXT: movdqa %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i32> %a, <i32 4, i32 5>
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE-LABEL: constant_shift_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: constant_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v4i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v4i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,2,3,u,u,u,u>
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v4i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v4i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psllw $3, %xmm1
-; SSE41-NEXT: psllw $2, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: constant_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $3, %xmm0, %xmm1
-; AVX-NEXT: vpsllw $2, %xmm0, %xmm0
-; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlw {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v2i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsllw $3, %xmm0, %xmm1
-; AVX512DQ-NEXT: vpsllw $2, %xmm0, %xmm0
-; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v2i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v2i16:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpsllw $3, %xmm0, %xmm1
-; AVX512DQVL-NEXT: vpsllw $2, %xmm0, %xmm0
-; AVX512DQVL-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v2i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i16> %a, <i16 2, i16 3>
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v8i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v8i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: packuswb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v4i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v4i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v4i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: packuswb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE2-LABEL: constant_shift_v2i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: constant_shift_v2i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: constant_shift_v2i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: constant_shift_v2i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; XOP-LABEL: constant_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512DQ-LABEL: constant_shift_v2i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQ-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
-;
-; AVX512BW-LABEL: constant_shift_v2i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQVL-LABEL: constant_shift_v2i8:
-; AVX512DQVL: # %bb.0:
-; AVX512DQVL-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512DQVL-NEXT: vpsllvd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQVL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQVL-NEXT: vzeroupper
-; AVX512DQVL-NEXT: retq
-;
-; AVX512BWVL-LABEL: constant_shift_v2i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; X32-SSE-LABEL: constant_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; X32-SSE-NEXT: pmullw {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: pxor %xmm1, %xmm1
-; X32-SSE-NEXT: packuswb %xmm1, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i8> %a, <i8 2, i8 3>
- ret <2 x i8> %shift
-}
-
-;
-; Uniform Constant Shifts
-;
-
-define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pslld $5, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpslld $5, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i32:
-; XOP: # %bb.0:
-; XOP-NEXT: vpslld $5, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpslld $5, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i32:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pslld $5, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i32> %a, <i32 5, i32 5>
- ret <2 x i32> %shift
-}
-
-define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v4i16:
-; SSE: # %bb.0:
-; SSE-NEXT: psllw $3, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v4i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v4i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v4i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v4i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v4i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $3, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
- ret <4 x i16> %shift
-}
-
-define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i16:
-; SSE: # %bb.0:
-; SSE-NEXT: psllw $3, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i16:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i16:
-; XOP: # %bb.0:
-; XOP-NEXT: vpsllw $3, %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i16:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $3, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i16> %a, <i16 3, i16 3>
- ret <2 x i16> %shift
-}
-
-define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psllw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v8i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v8i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v8i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
- ret <8 x i8> %shift
-}
-
-define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v4i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psllw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v4i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v4i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v4i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v4i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
- ret <4 x i8> %shift
-}
-
-define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
-; SSE-LABEL: splatconstant_shift_v2i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psllw $3, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: splatconstant_shift_v2i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; XOP-LABEL: splatconstant_shift_v2i8:
-; XOP: # %bb.0:
-; XOP-NEXT: vpshlb {{.*}}(%rip), %xmm0, %xmm0
-; XOP-NEXT: retq
-;
-; AVX512-LABEL: splatconstant_shift_v2i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: retq
-;
-; AVX512VL-LABEL: splatconstant_shift_v2i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpsllw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512VL-NEXT: retq
-;
-; X32-SSE-LABEL: splatconstant_shift_v2i8:
-; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: psllw $3, %xmm0
-; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
-; X32-SSE-NEXT: retl
- %shift = shl <2 x i8> %a, <i8 3, i8 3>
- ret <2 x i8> %shift
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2,AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
-
-;
-; add
-;
-
-define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_add_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: paddq %xmm3, %xmm1
-; SSE-NEXT: paddq %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_add_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_add_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <4 x i64> %a0, %a1
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_add_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: paddq %xmm6, %xmm2
-; SSE-NEXT: paddq %xmm7, %xmm3
-; SSE-NEXT: paddq %xmm4, %xmm0
-; SSE-NEXT: paddq %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_add_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpaddq %ymm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <8 x i64> %a0, %a1
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_add_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: paddd %xmm2, %xmm0
-; SSE-NEXT: paddd %xmm3, %xmm1
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_add_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <8 x i32> %a0, %a1
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_add_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm4
-; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: paddq {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpaddq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
-; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_add_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpaddq %ymm5, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpaddq %ymm4, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpaddq %ymm7, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpaddq %ymm6, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <16 x i64> %a0, %a1
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_add_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: paddd %xmm4, %xmm0
-; SSE-NEXT: paddd %xmm5, %xmm1
-; SSE-NEXT: paddd %xmm6, %xmm2
-; SSE-NEXT: paddd %xmm7, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_add_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <16 x i32> %a0, %a1
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
-; SSE-LABEL: trunc_add_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: paddw %xmm2, %xmm0
-; SSE-NEXT: paddw %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_add_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_add_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpaddw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = add <16 x i16> %a0, %a1
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
-; SSE-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm2
-; SSE-NEXT: psrad $16, %xmm2
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: packssdw %xmm2, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: paddw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = sext <8 x i8> %1 to <8 x i32>
- %3 = add <8 x i32> %2, %a1
- %4 = trunc <8 x i32> %3 to <8 x i16>
- ret <8 x i16> %4
-}
-
-;
-; add to constant
-;
-
-define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_add_const_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_const_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_add_const_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_const_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_add_const_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_const_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_add_const_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: paddw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_const_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_const_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_add_const_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_add_const_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_add_const_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
-; SSE-LABEL: trunc_add_const_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: paddb {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-;
-; sub
-;
-
-define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_sub_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: psubq %xmm3, %xmm1
-; SSE-NEXT: psubq %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_sub_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_sub_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <4 x i64> %a0, %a1
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_sub_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: psubq %xmm6, %xmm2
-; SSE-NEXT: psubq %xmm7, %xmm3
-; SSE-NEXT: psubq %xmm4, %xmm0
-; SSE-NEXT: psubq %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm3[1,2,3],xmm4[4],xmm3[5,6,7]
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_sub_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpsubq %ymm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <8 x i64> %a0, %a1
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_sub_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: psubd %xmm2, %xmm0
-; SSE-NEXT: psubd %xmm3, %xmm1
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_sub_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <8 x i32> %a0, %a1
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_sub_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm4
-; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: psubq {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpsubq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
-; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_sub_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpsubq %ymm5, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpsubq %ymm4, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpsubq %ymm7, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpsubq %ymm6, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <16 x i64> %a0, %a1
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_sub_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psubd %xmm4, %xmm0
-; SSE-NEXT: psubd %xmm5, %xmm1
-; SSE-NEXT: psubd %xmm6, %xmm2
-; SSE-NEXT: psubd %xmm7, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_sub_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <16 x i32> %a0, %a1
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
-; SSE-LABEL: trunc_sub_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psubw %xmm2, %xmm0
-; SSE-NEXT: psubw %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_sub_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_sub_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpsubw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = sub <16 x i16> %a0, %a1
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_ext_sub_v16i16_v16i8(<16 x i8> %x, <16 x i8> %y) {
-; SSE-LABEL: trunc_ext_sub_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psubb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: trunc_ext_sub_v16i16_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %a = zext <16 x i8> %x to <16 x i16>
- %b = zext <16 x i8> %y to <16 x i16>
- %c = sub <16 x i16> %a, %b
- %d = trunc <16 x i16> %c to <16 x i8>
- ret <16 x i8> %d
-}
-
-;
-; sub to constant
-;
-
-define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_sub_const_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: psubd {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_const_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpsubd {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_sub_const_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_const_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_sub_const_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_const_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_sub_const_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: psubw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_const_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_const_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_sub_const_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_sub_const_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_const_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_sub_const_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_sub_const_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
-; SSE-LABEL: trunc_sub_const_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_ext_sub_const_rhs_v16i16_v16i8(<16 x i8> %x) {
-; SSE-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: psubb {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: trunc_ext_sub_const_rhs_v16i16_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
- %a = zext <16 x i8> %x to <16 x i16>
- %b = sub <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
- %c = trunc <16 x i16> %b to <16 x i8>
- ret <16 x i8> %c
-}
-
-define <16 x i8> @trunc_ext_sub_const_lhs_v16i16_v16i8(<16 x i8> %x) {
-; SSE-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; SSE-NEXT: psubb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: trunc_ext_sub_const_lhs_v16i16_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-; AVX-NEXT: vpsubb %xmm0, %xmm1, %xmm0
-; AVX-NEXT: retq
- %a = zext <16 x i8> %x to <16 x i16>
- %b = sub <16 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>, %a
- %c = trunc <16 x i16> %b to <16 x i8>
- ret <16 x i8> %c
-}
-
-;
-; mul
-;
-
-define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_mul_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_mul_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_mul_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_mul_v4i64_v4i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = mul <4 x i64> %a0, %a1
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_mul_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: pmullw %xmm6, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_mul_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpmullw %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_mul_v8i64_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = mul <8 x i64> %a0, %a1
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_mul_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_mul_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_mul_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmulld %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = mul <8 x i32> %a0, %a1
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_mul_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4
-; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255]
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm4, %xmm3
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm7, %xmm8, %xmm3
-; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_mul_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm7, %xmm8
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm7[0,2],xmm8[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm7
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm7[0,2]
-; AVX2-SLOW-NEXT: vpmulld %xmm8, %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm6, %xmm7
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm7[0,2]
-; AVX2-SLOW-NEXT: vpmulld %xmm6, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm5, %xmm7
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm5 = xmm5[0,2],xmm7[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm7[0,2]
-; AVX2-SLOW-NEXT: vpmulld %xmm5, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm4, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vpmulld %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm6, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_mul_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm7, %ymm8, %ymm7
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm8, %ymm3
-; AVX2-FAST-NEXT: vpmulld %xmm7, %xmm3, %xmm3
-; AVX2-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm8, %ymm2
-; AVX2-FAST-NEXT: vpmulld %xmm6, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm1
-; AVX2-FAST-NEXT: vpmulld %xmm5, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm4
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0
-; AVX2-FAST-NEXT: vpmulld %xmm4, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm6, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_mul_v16i64_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovqd %zmm3, %ymm3
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmulld %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovqd %zmm3, %ymm3
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovqd %zmm2, %ymm2
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmulld %ymm2, %ymm0, %ymm0
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullq %zmm3, %zmm1, %zmm1
-; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = mul <16 x i64> %a0, %a1
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_mul_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm8, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm6, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm7, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm5
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_mul_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_mul_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = mul <16 x i32> %a0, %a1
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
-; SSE-LABEL: trunc_mul_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pmullw %xmm2, %xmm0
-; SSE-NEXT: pmullw %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_mul_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_mul_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = mul <16 x i16> %a0, %a1
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
-; SSE-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm3, %xmm3
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE-NEXT: pslld $16, %xmm2
-; SSE-NEXT: psrad $16, %xmm2
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: packssdw %xmm2, %xmm1
-; SSE-NEXT: pmullw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = zext <8 x i8> %1 to <8 x i32>
- %3 = mul <8 x i32> %2, %a1
- %4 = trunc <8 x i32> %3 to <8 x i16>
- ret <8 x i16> %4
-}
-
-;
-; mul to constant
-;
-
-define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_mul_const_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_mul_const_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_mul_const_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_mul_const_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_mul_const_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_const_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_mul_const_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_mul_const_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6
-; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3
-; AVX1-NEXT: vpmuludq {{.*}}(%rip), %xmm3, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255]
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpackusdw %xmm3, %xmm7, %xmm3
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm5, %xmm3
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm8, %xmm3
-; AVX1-NEXT: vpackusdw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm3, %xmm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_mul_const_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [0,1,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [8,9,10,11]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [12,13,14,15]
-; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm4, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE-NEXT: pmuludq %xmm5, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_const_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255]
-; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackusdw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_mul_const_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmulld {{.*}}(%rip), %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
-; SSE-LABEL: trunc_mul_const_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm0
-; SSE-NEXT: pmullw {{.*}}(%rip), %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmullw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-;
-; and
-;
-
-define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_and_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: andps %xmm3, %xmm1
-; SSE-NEXT: andps %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_and_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_and_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <4 x i64> %a0, %a1
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_and_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm6, %xmm2
-; SSE-NEXT: pand %xmm7, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: pand %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_and_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <8 x i64> %a0, %a1
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_and_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pand %xmm3, %xmm1
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_and_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <8 x i32> %a0, %a1
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_and_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm4
-; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: pand {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm8 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm8, %ymm7, %ymm7
-; AVX1-NEXT: vandps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm7
-; AVX1-NEXT: vpackusdw %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm8, %ymm6, %ymm6
-; AVX1-NEXT: vandps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpackusdw %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm8, %ymm5, %ymm3
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm8, %ymm4, %ymm3
-; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_and_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpand %ymm5, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpand %ymm4, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpand %ymm7, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpand %ymm6, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <16 x i64> %a0, %a1
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_and_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm3, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: pand %xmm2, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm1, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm5, %xmm0
-; SSE-NEXT: packuswb %xmm6, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_and_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <16 x i32> %a0, %a1
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
-; SSE-LABEL: trunc_and_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm1, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm3, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_and_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_and_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = and <16 x i16> %a0, %a1
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-;
-; and to constant
-;
-
-define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_and_const_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: andps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_const_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_and_const_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: andpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_const_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_and_const_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_const_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_and_const_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_const_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_const_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_and_const_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_and_const_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_and_const_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
-; SSE-LABEL: trunc_and_const_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-;
-; xor
-;
-
-define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_xor_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm3, %xmm1
-; SSE-NEXT: xorps %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_xor_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_xor_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <4 x i64> %a0, %a1
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_xor_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_xor_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <8 x i64> %a0, %a1
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_xor_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_xor_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <8 x i32> %a0, %a1
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_xor_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm4
-; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: pxor {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT: vxorps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vxorps %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_xor_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpxor %ymm5, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpxor %ymm4, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpxor %ymm7, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <16 x i64> %a0, %a1
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_xor_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm4, %xmm0
-; SSE-NEXT: pxor %xmm5, %xmm1
-; SSE-NEXT: pxor %xmm6, %xmm2
-; SSE-NEXT: pxor %xmm7, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_xor_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <16 x i32> %a0, %a1
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
-; SSE-LABEL: trunc_xor_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm2, %xmm0
-; SSE-NEXT: pxor %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_xor_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_xor_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = xor <16 x i16> %a0, %a1
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-;
-; xor to constant
-;
-
-define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_xor_const_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: xorps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_const_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_xor_const_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: xorpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_const_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_xor_const_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_const_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_xor_const_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_const_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_const_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_xor_const_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_xor_const_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_xor_const_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
-; SSE-LABEL: trunc_xor_const_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: pxor {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-;
-; or
-;
-
-define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_or_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: orps %xmm3, %xmm1
-; SSE-NEXT: orps %xmm2, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_or_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_or_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <4 x i64> %a0, %a1
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_or_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_or_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <8 x i64> %a0, %a1
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_or_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_or_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <8 x i32> %a0, %a1
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
-; SSE-LABEL: trunc_or_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
-; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
-; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm2
-; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm3
-; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm4
-; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm5
-; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm6
-; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm7
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm5, %ymm1, %ymm1
-; AVX1-NEXT: vorps %ymm6, %ymm2, %ymm2
-; AVX1-NEXT: vorps %ymm7, %ymm3, %ymm3
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_or_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpor %ymm5, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpor %ymm4, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpor %ymm7, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vpor %ymm6, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <16 x i64> %a0, %a1
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind {
-; SSE-LABEL: trunc_or_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm4, %xmm0
-; SSE-NEXT: por %xmm5, %xmm1
-; SSE-NEXT: por %xmm6, %xmm2
-; SSE-NEXT: por %xmm7, %xmm3
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vorps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_or_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <16 x i32> %a0, %a1
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind {
-; SSE-LABEL: trunc_or_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: por %xmm2, %xmm0
-; SSE-NEXT: por %xmm3, %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_or_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_or_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpor %ymm1, %ymm0, %ymm0
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = or <16 x i16> %a0, %a1
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-;
-; or to constant
-;
-
-define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_or_const_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: orps {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vorps {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_const_v4i64_v4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
- %2 = trunc <4 x i64> %1 to <4 x i32>
- ret <4 x i32> %2
-}
-
-define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_or_const_v8i64_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE-NEXT: orpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_const_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_or_const_v8i64_v8i16:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_const_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
- %2 = trunc <8 x i64> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_or_const_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: pslld $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: pslld $16, %xmm0
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_const_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_const_v8i32_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %2 = trunc <8 x i32> %1 to <8 x i16>
- ret <8 x i16> %2
-}
-
-define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
-; SSE-LABEL: trunc_or_const_v16i64_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE-NEXT: pand %xmm8, %xmm7
-; SSE-NEXT: pand %xmm8, %xmm6
-; SSE-NEXT: packuswb %xmm7, %xmm6
-; SSE-NEXT: pand %xmm8, %xmm5
-; SSE-NEXT: pand %xmm8, %xmm4
-; SSE-NEXT: packuswb %xmm5, %xmm4
-; SSE-NEXT: packuswb %xmm6, %xmm4
-; SSE-NEXT: pand %xmm8, %xmm3
-; SSE-NEXT: pand %xmm8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm8, %xmm1
-; SSE-NEXT: pand %xmm8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm4, %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vandps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpackusdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vandps %ymm4, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm4, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2
-; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: vpand %xmm5, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
- %2 = trunc <16 x i64> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
-; SSE-LABEL: trunc_or_const_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE-NEXT: pand %xmm4, %xmm3
-; SSE-NEXT: pand %xmm4, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: pand %xmm4, %xmm1
-; SSE-NEXT: pand %xmm4, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_or_const_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- %2 = trunc <16 x i32> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
-; SSE-LABEL: trunc_or_const_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE-NEXT: pand %xmm2, %xmm1
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
-; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX512DQ-NEXT: vzeroupper
-; AVX512DQ-NEXT: retq
- %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
- %2 = trunc <16 x i16> %1 to <16 x i8>
- ret <16 x i8> %2
-}
-
-;
-; complex patterns - often created by vectorizer
-;
-
-define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
-; SSE-LABEL: mul_add_const_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: paddd {{.*}}(%rip), %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: mul_add_const_v4i64_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: retq
- %1 = sext <4 x i32> %a0 to <4 x i64>
- %2 = sext <4 x i32> %a1 to <4 x i64>
- %3 = mul <4 x i64> %1, %2
- %4 = add <4 x i64> %3, <i64 -3, i64 -1, i64 1, i64 3>
- %5 = trunc <4 x i64> %4 to <4 x i32>
- ret <4 x i32> %5
-}
-
-define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
-; SSE-LABEL: mul_add_self_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: paddd %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: mul_add_self_v4i64_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
- %1 = sext <4 x i32> %a0 to <4 x i64>
- %2 = sext <4 x i32> %a1 to <4 x i64>
- %3 = mul <4 x i64> %1, %2
- %4 = add <4 x i64> %3, %3
- %5 = trunc <4 x i64> %4 to <4 x i32>
- ret <4 x i32> %5
-}
-
-define <4 x i32> @mul_add_multiuse_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind {
-; SSE-LABEL: mul_add_multiuse_v4i64_v4i32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3]
-; SSE-NEXT: pmuludq %xmm2, %xmm4
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3]
-; SSE-NEXT: pmuludq %xmm3, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2]
-; SSE-NEXT: paddd %xmm4, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
-; AVX: # %bb.0:
-; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: retq
- %1 = sext <4 x i32> %a0 to <4 x i64>
- %2 = sext <4 x i32> %a1 to <4 x i64>
- %3 = mul <4 x i64> %1, %2
- %4 = add <4 x i64> %1, %3
- %5 = trunc <4 x i64> %4 to <4 x i32>
- ret <4 x i32> %5
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-
-;
-; PACKUS saturation truncation to vXi32
-;
-
-define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) {
-; SSE2-LABEL: trunc_packus_v4i64_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v4i64_v4i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v4i64_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: xorpd %xmm1, %xmm1
-; SSE41-NEXT: movapd %xmm4, %xmm2
-; SSE41-NEXT: xorpd %xmm3, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: movapd %xmm5, %xmm4
-; SSE41-NEXT: xorpd %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpand %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_packus_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_packus_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_packus_v4i64_v4i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_packus_v4i64_v4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovusqd %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_packus_v4i64_v4i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_packus_v4i64_v4i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovusqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %3 = icmp sgt <4 x i64> %2, zeroinitializer
- %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> zeroinitializer
- %5 = trunc <4 x i64> %4 to <4 x i32>
- ret <4 x i32> %5
-}
-
-
-define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_packus_v8i64_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm6
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i64_v8i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483647,2147483647]
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm2, %xmm6
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm6, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm1
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i64_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm8
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm5, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm9
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm5, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm5, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm5, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: xorpd %xmm2, %xmm2
-; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: xorpd %xmm10, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3
-; SSE41-NEXT: movapd %xmm4, %xmm1
-; SSE41-NEXT: xorpd %xmm10, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
-; SSE41-NEXT: movapd %xmm9, %xmm3
-; SSE41-NEXT: xorpd %xmm10, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3
-; SSE41-NEXT: movapd %xmm8, %xmm4
-; SSE41-NEXT: xorpd %xmm10, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v8i64_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4294967295,4294967295]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
-; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-FAST-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_packus_v8i64_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovusqd %zmm0, %ymm0
-; AVX512-NEXT: retq
- %1 = icmp slt <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %3 = icmp sgt <8 x i64> %2, zeroinitializer
- %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
- %5 = trunc <8 x i64> %4 to <8 x i32>
- ret <8 x i32> %5
-}
-
-;
-; PACKUS saturation truncation to vXi16
-;
-
-define <8 x i16> @trunc_packus_v8i64_v8i16(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_packus_v8i64_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pxor %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i64_v8i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm1, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm3, %xmm6
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm6, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pxor %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i64_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: xorpd %xmm10, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4
-; SSE41-NEXT: movapd %xmm6, %xmm1
-; SSE41-NEXT: xorpd %xmm10, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE41-NEXT: packusdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: xorpd %xmm10, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: movapd %xmm8, %xmm2
-; SSE41-NEXT: xorpd %xmm10, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: packusdw %xmm4, %xmm3
-; SSE41-NEXT: packusdw %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpand %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpand %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_packus_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovusqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %3 = icmp sgt <8 x i64> %2, zeroinitializer
- %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
- %5 = trunc <8 x i64> %4 to <8 x i16>
- ret <8 x i16> %5
-}
-
-define <8 x i16> @trunc_packus_v8i32_v8i16(<8 x i32> %a0) {
-; SSE2-LABEL: trunc_packus_v8i32_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i32_v8i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i32_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_packus_v8i32_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_packus_v8i32_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_packus_v8i32_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %3 = icmp sgt <8 x i32> %2, zeroinitializer
- %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
- %5 = trunc <8 x i32> %4 to <8 x i16>
- ret <8 x i16> %5
-}
-
-define <16 x i16> @trunc_packus_v16i32_v16i16(<16 x i32> %a0) {
-; SSE2-LABEL: trunc_packus_v16i32_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
-; SSE2-NEXT: movdqa %xmm6, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm6, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm6, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm6, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm6, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm6, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm6, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm5
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm5, %xmm0
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v16i32_v16i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,65535]
-; SSSE3-NEXT: movdqa %xmm6, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm6, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm6, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pandn %xmm6, %xmm5
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: movdqa %xmm6, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm6, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm6, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm6, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm5
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pslld $16, %xmm5
-; SSSE3-NEXT: psrad $16, %xmm5
-; SSSE3-NEXT: pslld $16, %xmm0
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm5, %xmm0
-; SSSE3-NEXT: pslld $16, %xmm3
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: pslld $16, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: packssdw %xmm3, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v16i32_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v16i32_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v16i32_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_packus_v16i32_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovusdw %zmm0, %ymm0
-; AVX512-NEXT: retq
- %1 = icmp slt <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %3 = icmp sgt <16 x i32> %2, zeroinitializer
- %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
- %5 = trunc <16 x i32> %4 to <16 x i16>
- ret <16 x i16> %5
-}
-
-;
-; PACKUS saturation truncation to v16i8
-;
-
-define <8 x i8> @trunc_packus_v8i64_v8i8(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_packus_v8i64_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i64_v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i64_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm6
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: xorpd %xmm10, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4
-; SSE41-NEXT: movapd %xmm6, %xmm1
-; SSE41-NEXT: xorpd %xmm10, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE41-NEXT: packusdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: xorpd %xmm10, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: movapd %xmm8, %xmm2
-; SSE41-NEXT: xorpd %xmm10, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: packusdw %xmm4, %xmm3
-; SSE41-NEXT: packusdw %xmm3, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v8i64_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v8i64_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_packus_v8i64_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %3 = icmp sgt <8 x i64> %2, zeroinitializer
- %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
- %5 = trunc <8 x i64> %4 to <8 x i8>
- ret <8 x i8> %5
-}
-
-define void @trunc_packus_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
-; SSE2-LABEL: trunc_packus_v8i64_v8i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm10, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm10, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm10, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm10, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v8i64_v8i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm10, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm10, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm10, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm10, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm10, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v8i64_v8i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: movapd {{.*#+}} xmm7 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: movapd %xmm7, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm10, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm7
-; SSE41-NEXT: xorpd %xmm1, %xmm1
-; SSE41-NEXT: movapd %xmm7, %xmm4
-; SSE41-NEXT: xorpd %xmm10, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm5
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: xorpd %xmm10, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm4
-; SSE41-NEXT: packusdw %xmm5, %xmm4
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: xorpd %xmm10, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: movapd %xmm8, %xmm2
-; SSE41-NEXT: xorpd %xmm10, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm10, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm10, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE41-NEXT: packusdw %xmm3, %xmm1
-; SSE41-NEXT: packusdw %xmm1, %xmm4
-; SSE41-NEXT: packuswb %xmm0, %xmm4
-; SSE41-NEXT: movq %xmm4, (%rdi)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v8i64_v8i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpand %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpand %xmm6, %xmm7, %xmm2
-; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v8i64_v8i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vpand %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_packus_v8i64_v8i8_store:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovusqb %zmm0, (%rdi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %3 = icmp sgt <8 x i64> %2, zeroinitializer
- %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
- %5 = trunc <8 x i64> %4 to <8 x i8>
- store <8 x i8> %5, <8 x i8> *%p1
- ret void
-}
-
-define <16 x i8> @trunc_packus_v16i64_v16i8(<16 x i64> %a0) {
-; SSE2-LABEL: trunc_packus_v16i64_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm6, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm11, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm14
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm14, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm6
-; SSE2-NEXT: pandn %xmm10, %xmm9
-; SSE2-NEXT: por %xmm6, %xmm9
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm11, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm12
-; SSE2-NEXT: pand %xmm12, %xmm7
-; SSE2-NEXT: pandn %xmm10, %xmm12
-; SSE2-NEXT: por %xmm7, %xmm12
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm13
-; SSE2-NEXT: pand %xmm13, %xmm4
-; SSE2-NEXT: pandn %xmm10, %xmm13
-; SSE2-NEXT: por %xmm4, %xmm13
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm14
-; SSE2-NEXT: pand %xmm14, %xmm5
-; SSE2-NEXT: pandn %xmm10, %xmm14
-; SSE2-NEXT: por %xmm5, %xmm14
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm10, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm10, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm8, %xmm3
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm10, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm10, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm14, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm14, %xmm2
-; SSE2-NEXT: movdqa %xmm13, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm13, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm12, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm12, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm8, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm9, %xmm4
-; SSE2-NEXT: packuswb %xmm3, %xmm4
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_packus_v16i64_v16i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm6, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm9
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm11, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm14
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
-; SSSE3-NEXT: por %xmm14, %xmm9
-; SSSE3-NEXT: pand %xmm9, %xmm6
-; SSSE3-NEXT: pandn %xmm10, %xmm9
-; SSSE3-NEXT: por %xmm6, %xmm9
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: movdqa %xmm11, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm12
-; SSSE3-NEXT: pand %xmm12, %xmm7
-; SSSE3-NEXT: pandn %xmm10, %xmm12
-; SSSE3-NEXT: por %xmm7, %xmm12
-; SSSE3-NEXT: movdqa %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: movdqa %xmm11, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm13
-; SSSE3-NEXT: pand %xmm13, %xmm4
-; SSSE3-NEXT: pandn %xmm10, %xmm13
-; SSSE3-NEXT: por %xmm4, %xmm13
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: movdqa %xmm11, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm14
-; SSSE3-NEXT: pand %xmm14, %xmm5
-; SSSE3-NEXT: pandn %xmm10, %xmm14
-; SSSE3-NEXT: por %xmm5, %xmm14
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: movdqa %xmm11, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm10, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm10, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm8, %xmm3
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm10, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm10, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm14, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm14, %xmm2
-; SSSE3-NEXT: movdqa %xmm13, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm13, %xmm1
-; SSSE3-NEXT: packuswb %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm12, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm12, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm8, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm8, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pand %xmm9, %xmm4
-; SSSE3-NEXT: packuswb %xmm3, %xmm4
-; SSSE3-NEXT: packuswb %xmm4, %xmm1
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v16i64_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm11 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483903,2147483903]
-; SSE41-NEXT: movdqa %xmm12, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: movdqa %xmm12, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: por %xmm13, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm10
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm13
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm13
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: movdqa %xmm12, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm14
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm15
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm12, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm12, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: movapd %xmm11, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm7
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm7
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: packusdw %xmm7, %xmm1
-; SSE41-NEXT: movapd %xmm6, %xmm3
-; SSE41-NEXT: xorpd %xmm9, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE41-NEXT: movapd %xmm5, %xmm4
-; SSE41-NEXT: xorpd %xmm9, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: packusdw %xmm3, %xmm4
-; SSE41-NEXT: packusdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm15, %xmm3
-; SSE41-NEXT: xorpd %xmm9, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm4
-; SSE41-NEXT: movapd %xmm14, %xmm3
-; SSE41-NEXT: xorpd %xmm9, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm3
-; SSE41-NEXT: packusdw %xmm4, %xmm3
-; SSE41-NEXT: movapd %xmm13, %xmm4
-; SSE41-NEXT: xorpd %xmm9, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm4
-; SSE41-NEXT: movapd %xmm10, %xmm5
-; SSE41-NEXT: xorpd %xmm9, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm9, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
-; SSE41-NEXT: packusdw %xmm4, %xmm2
-; SSE41-NEXT: packusdw %xmm2, %xmm3
-; SSE41-NEXT: packuswb %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255]
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm5, %xmm10
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm11
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm9, %xmm5, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm0
-; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm5, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm5, %xmm3
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4
-; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm3
-; AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm4
-; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm2
-; AVX1-NEXT: vpand %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm11, %xmm2
-; AVX1-NEXT: vpand %xmm11, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm3
-; AVX1-NEXT: vpand %xmm10, %xmm3, %xmm3
-; AVX1-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [255,255,255,255]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vpand %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
-; AVX2-NEXT: vpand %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1
-; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3
-; AVX2-NEXT: vpand %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_packus_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
-; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %3 = icmp sgt <16 x i64> %2, zeroinitializer
- %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> zeroinitializer
- %5 = trunc <16 x i64> %4 to <16 x i8>
- ret <16 x i8> %5
-}
-
-define <8 x i8> @trunc_packus_v8i32_v8i8(<8 x i32> %a0) {
-; SSE-LABEL: trunc_packus_v8i32_v8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v8i32_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v8i32_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_packus_v8i32_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_packus_v8i32_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_packus_v8i32_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %3 = icmp sgt <8 x i32> %2, zeroinitializer
- %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
- %5 = trunc <8 x i32> %4 to <8 x i8>
- ret <8 x i8> %5
-}
-
-define void @trunc_packus_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
-; SSE-LABEL: trunc_packus_v8i32_v8i8_store:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
-; SSE-NEXT: movq %xmm0, (%rdi)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v8i32_v8i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v8i32_v8i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_packus_v8i32_v8i8_store:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_packus_v8i32_v8i8_store:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_packus_v8i32_v8i8_store:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_packus_v8i32_v8i8_store:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %3 = icmp sgt <8 x i32> %2, zeroinitializer
- %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
- %5 = trunc <8 x i32> %4 to <8 x i8>
- store <8 x i8> %5, <8 x i8> *%p1
- ret void
-}
-
-define <16 x i8> @trunc_packus_v16i32_v16i8(<16 x i32> %a0) {
-; SSE-LABEL: trunc_packus_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_packus_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovusdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %3 = icmp sgt <16 x i32> %2, zeroinitializer
- %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
- %5 = trunc <16 x i32> %4 to <16 x i8>
- ret <16 x i8> %5
-}
-
-define <16 x i8> @trunc_packus_v16i16_v16i8(<16 x i16> %a0) {
-; SSE-LABEL: trunc_packus_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_packus_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_packus_v16i16_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_packus_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_packus_v16i16_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %3 = icmp sgt <16 x i16> %2, zeroinitializer
- %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
- %5 = trunc <16 x i16> %4 to <16 x i8>
- ret <16 x i8> %5
-}
-
-define <32 x i8> @trunc_packus_v32i16_v32i8(<32 x i16> %a0) {
-; SSE-LABEL: trunc_packus_v32i16_v32i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v32i16_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_packus_v32i16_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_packus_v32i16_v32i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_packus_v32i16_v32i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_packus_v32i16_v32i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_packus_v32i16_v32i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0
-; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %3 = icmp sgt <32 x i16> %2, zeroinitializer
- %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
- %5 = trunc <32 x i16> %4 to <32 x i8>
- ret <32 x i8> %5
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-
-;
-; Signed saturation truncation to vXi32
-;
-
-define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
-; SSE2-LABEL: trunc_ssat_v4i64_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pandn %xmm1, %xmm6
-; SSE2-NEXT: por %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm5, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm1, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v4i64_v4i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [18446744069414584320,18446744069414584320]
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pandn %xmm1, %xmm6
-; SSSE3-NEXT: por %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm5, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm1, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v4i64_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movapd {{.*#+}} xmm4 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa %xmm6, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm3, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE41-NEXT: movapd %xmm4, %xmm2
-; SSE41-NEXT: xorpd %xmm3, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [18446744069414584320,18446744069414584320]
-; SSE41-NEXT: movapd %xmm2, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: xorpd %xmm5, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm6, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE41-NEXT: movaps %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [2147483647,2147483647]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-FAST-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsqd %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_ssat_v4i64_v4i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <4 x i64> %a0, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
- %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
- %3 = icmp sgt <4 x i64> %2, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
- %4 = select <4 x i1> %3, <4 x i64> %2, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
- %5 = trunc <4 x i64> %4 to <4 x i32>
- ret <4 x i32> %5
-}
-
-
-define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_ssat_v8i64_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm7, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v8i64_v8i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483647,2147483647]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm6
-; SSSE3-NEXT: por %xmm3, %xmm6
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067968,18446744071562067968]
-; SSSE3-NEXT: movdqa %xmm6, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744069414584320,18446744069414584320]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm7, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm7, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v8i64_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm9
-; SSE41-NEXT: movapd {{.*#+}} xmm10 = [2147483647,2147483647]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm10, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm8
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm10, %xmm9
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm9
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm10, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm10
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
-; SSE41-NEXT: movapd %xmm10, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [18446744069414584320,18446744069414584320]
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm4
-; SSE41-NEXT: movapd %xmm6, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2]
-; SSE41-NEXT: movapd %xmm9, %xmm4
-; SSE41-NEXT: xorpd %xmm5, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm4
-; SSE41-NEXT: xorpd %xmm8, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm2
-; SSE41-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2]
-; SSE41-NEXT: movaps %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v8i64_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [2147483647,2147483647]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_ssat_v8i64_v8i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
-; AVX2-FAST-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_ssat_v8i64_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsqd %zmm0, %ymm0
-; AVX512-NEXT: retq
- %1 = icmp slt <8 x i64> %a0, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
- %3 = icmp sgt <8 x i64> %2, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
- %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
- %5 = trunc <8 x i64> %4 to <8 x i32>
- ret <8 x i32> %5
-}
-
-;
-; Signed saturation truncation to vXi16
-;
-
-define <8 x i16> @trunc_ssat_v8i64_v8i16(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_ssat_v8i64_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm1, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: packssdw %xmm3, %xmm1
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v8i64_v8i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [32767,32767]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147516415,2147516415]
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm1, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709518848,18446744073709518848]
-; SSSE3-NEXT: movdqa %xmm7, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562035200,18446744071562035200]
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm7, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: packssdw %xmm3, %xmm1
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v8i64_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm10
-; SSE41-NEXT: movapd {{.*#+}} xmm11 = [32767,32767]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147516415,2147516415]
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm9
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9
-; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11
-; SSE41-NEXT: movapd {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
-; SSE41-NEXT: movapd %xmm11, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562035200,18446744071562035200]
-; SSE41-NEXT: movapd %xmm1, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm6
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packssdw %xmm6, %xmm1
-; SSE41-NEXT: movapd %xmm9, %xmm2
-; SSE41-NEXT: xorpd %xmm5, %xmm2
-; SSE41-NEXT: movapd %xmm2, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm3, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2
-; SSE41-NEXT: xorpd %xmm8, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm3
-; SSE41-NEXT: packssdw %xmm2, %xmm3
-; SSE41-NEXT: packssdw %xmm3, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32767,32767]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm9
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm8, %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [32767,32767,32767,32767]
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_ssat_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <8 x i64> %a0, <i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767, i64 32767>
- %3 = icmp sgt <8 x i64> %2, <i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768>
- %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768, i64 -32768>
- %5 = trunc <8 x i64> %4 to <8 x i16>
- ret <8 x i16> %5
-}
-
-define <8 x i16> @trunc_ssat_v8i32_v8i16(<8 x i32> %a0) {
-; SSE-LABEL: trunc_ssat_v8i32_v8i16:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ssat_v8i32_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
-; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
-; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_ssat_v8i32_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsdw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ssat_v8i32_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [32767,32767,32767,32767,32767,32767,32767,32767]
-; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528,4294934528]
-; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <8 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
- %3 = icmp sgt <8 x i32> %2, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %5 = trunc <8 x i32> %4 to <8 x i16>
- ret <8 x i16> %5
-}
-
-define <16 x i16> @trunc_ssat_v16i32_v16i16(<16 x i32> %a0) {
-; SSE-LABEL: trunc_ssat_v16i32_v16i16:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v16i32_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v16i32_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_ssat_v16i32_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsdw %zmm0, %ymm0
-; AVX512-NEXT: retq
- %1 = icmp slt <16 x i32> %a0, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
- %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
- %3 = icmp sgt <16 x i32> %2, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768>
- %5 = trunc <16 x i32> %4 to <16 x i16>
- ret <16 x i16> %5
-}
-
-;
-; Signed saturation truncation to v16i8
-;
-
-define <8 x i8> @trunc_ssat_v8i64_v8i8(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_ssat_v8i64_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm0, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm7, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v8i64_v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm0, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm7, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm3, %xmm7
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm7, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v8i64_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm9
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm10
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm11, %xmm1
-; SSE41-NEXT: xorpd %xmm5, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: movapd %xmm1, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm1
-; SSE41-NEXT: movapd %xmm3, %xmm6
-; SSE41-NEXT: xorpd %xmm5, %xmm6
-; SSE41-NEXT: movapd %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm7
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movapd %xmm10, %xmm3
-; SSE41-NEXT: xorpd %xmm5, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
-; SSE41-NEXT: xorpd %xmm9, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm2
-; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE41-NEXT: andpd %xmm0, %xmm2
-; SSE41-NEXT: andpd %xmm0, %xmm3
-; SSE41-NEXT: packusdw %xmm2, %xmm3
-; SSE41-NEXT: andpd %xmm0, %xmm7
-; SSE41-NEXT: andpd %xmm0, %xmm1
-; SSE41-NEXT: packusdw %xmm7, %xmm1
-; SSE41-NEXT: packusdw %xmm3, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v8i64_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
-; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11
-; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8
-; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0
-; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1
-; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v8i64_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_ssat_v8i64_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
- %3 = icmp sgt <8 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
- %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
- %5 = trunc <8 x i64> %4 to <8 x i8>
- ret <8 x i8> %5
-}
-
-; TODO: The AVX1 codegen shows a missed opportunity to narrow blendv+logic to 128-bit.
-
-define void @trunc_ssat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
-; SSE2-LABEL: trunc_ssat_v8i64_v8i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm0, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm7, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm2, %xmm7
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm4
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: packuswb %xmm7, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v8i64_v8i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [127,127]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm4, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [2147483775,2147483775]
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm0, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm7, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm2, %xmm7
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm4
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm3, %xmm7
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: packuswb %xmm7, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSSE3-NEXT: movq %xmm0, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v8i64_v8i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm7
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm9
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm9
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm6, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm10
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm10
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm6, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm6, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm11
-; SSE41-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm11, %xmm2
-; SSE41-NEXT: xorpd %xmm5, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: movapd %xmm2, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm2
-; SSE41-NEXT: movapd %xmm3, %xmm6
-; SSE41-NEXT: xorpd %xmm5, %xmm6
-; SSE41-NEXT: movapd %xmm6, %xmm7
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm7, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm7
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7
-; SSE41-NEXT: movapd %xmm10, %xmm3
-; SSE41-NEXT: xorpd %xmm5, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm3
-; SSE41-NEXT: xorpd %xmm9, %xmm5
-; SSE41-NEXT: movapd %xmm5, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm5, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm9, %xmm1
-; SSE41-NEXT: movapd {{.*#+}} xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE41-NEXT: andpd %xmm0, %xmm1
-; SSE41-NEXT: andpd %xmm0, %xmm3
-; SSE41-NEXT: packusdw %xmm1, %xmm3
-; SSE41-NEXT: andpd %xmm0, %xmm7
-; SSE41-NEXT: andpd %xmm0, %xmm2
-; SSE41-NEXT: packusdw %xmm7, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: packuswb %xmm0, %xmm2
-; SSE41-NEXT: movq %xmm2, (%rdi)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v8i64_v8i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovapd {{.*#+}} ymm8 = [127,127,127,127]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [127,127]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm7
-; AVX1-NEXT: vblendvpd %ymm7, %ymm1, %ymm8, %ymm9
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm10
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm10, %ymm11
-; AVX1-NEXT: vblendvpd %ymm11, %ymm0, %ymm8, %ymm8
-; AVX1-NEXT: vmovapd {{.*#+}} ymm11 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vblendvpd %ymm0, %ymm8, %ymm11, %ymm0
-; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vblendvpd %ymm1, %ymm9, %ymm11, %ymm1
-; AVX1-NEXT: vmovapd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX1-NEXT: vandpd %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandpd %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v8i64_v8i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [127,127,127,127]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_ssat_v8i64_v8i8_store:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsqb %zmm0, (%rdi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <8 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
- %3 = icmp sgt <8 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
- %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
- %5 = trunc <8 x i64> %4 to <8 x i8>
- store <8 x i8> %5, <8 x i8> *%p1
- ret void
-}
-
-define <16 x i8> @trunc_ssat_v16i64_v16i8(<16 x i64> %a0) {
-; SSE2-LABEL: trunc_ssat_v16i64_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [127,127]
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm6, %xmm9
-; SSE2-NEXT: pxor %xmm8, %xmm9
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775]
-; SSE2-NEXT: movdqa %xmm11, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm14
-; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm14, %xmm9
-; SSE2-NEXT: pand %xmm9, %xmm6
-; SSE2-NEXT: pandn %xmm10, %xmm9
-; SSE2-NEXT: por %xmm6, %xmm9
-; SSE2-NEXT: movdqa %xmm7, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm11, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm12
-; SSE2-NEXT: pand %xmm12, %xmm7
-; SSE2-NEXT: pandn %xmm10, %xmm12
-; SSE2-NEXT: por %xmm7, %xmm12
-; SSE2-NEXT: movdqa %xmm4, %xmm6
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: movdqa %xmm11, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm13
-; SSE2-NEXT: pand %xmm13, %xmm4
-; SSE2-NEXT: pandn %xmm10, %xmm13
-; SSE2-NEXT: por %xmm4, %xmm13
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm11, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm14
-; SSE2-NEXT: pand %xmm14, %xmm5
-; SSE2-NEXT: pandn %xmm10, %xmm14
-; SSE2-NEXT: por %xmm5, %xmm14
-; SSE2-NEXT: movdqa %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm8, %xmm4
-; SSE2-NEXT: movdqa %xmm11, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm10, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pandn %xmm10, %xmm6
-; SSE2-NEXT: por %xmm3, %xmm6
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pxor %xmm8, %xmm2
-; SSE2-NEXT: movdqa %xmm11, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm0
-; SSE2-NEXT: pandn %xmm10, %xmm3
-; SSE2-NEXT: por %xmm0, %xmm3
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm11, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm10, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488]
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm10, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm3
-; SSE2-NEXT: pandn %xmm10, %xmm0
-; SSE2-NEXT: por %xmm3, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm6, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: pandn %xmm10, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pandn %xmm10, %xmm3
-; SSE2-NEXT: por %xmm5, %xmm3
-; SSE2-NEXT: packssdw %xmm2, %xmm3
-; SSE2-NEXT: packssdw %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm14, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm14
-; SSE2-NEXT: pandn %xmm10, %xmm2
-; SSE2-NEXT: por %xmm14, %xmm2
-; SSE2-NEXT: movdqa %xmm13, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm13
-; SSE2-NEXT: pandn %xmm10, %xmm3
-; SSE2-NEXT: por %xmm13, %xmm3
-; SSE2-NEXT: packssdw %xmm2, %xmm3
-; SSE2-NEXT: movdqa %xmm12, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm12
-; SSE2-NEXT: pandn %xmm10, %xmm2
-; SSE2-NEXT: por %xmm12, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm8
-; SSE2-NEXT: movdqa %xmm8, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm11, %xmm8
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm9
-; SSE2-NEXT: pandn %xmm10, %xmm1
-; SSE2-NEXT: por %xmm9, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
-; SSE2-NEXT: packssdw %xmm1, %xmm3
-; SSE2-NEXT: packsswb %xmm3, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_ssat_v16i64_v16i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [127,127]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm6, %xmm9
-; SSSE3-NEXT: pxor %xmm8, %xmm9
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [2147483775,2147483775]
-; SSSE3-NEXT: movdqa %xmm11, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm9[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm14
-; SSSE3-NEXT: pshufd {{.*#+}} xmm9 = xmm12[1,1,3,3]
-; SSSE3-NEXT: por %xmm14, %xmm9
-; SSSE3-NEXT: pand %xmm9, %xmm6
-; SSSE3-NEXT: pandn %xmm10, %xmm9
-; SSSE3-NEXT: por %xmm6, %xmm9
-; SSSE3-NEXT: movdqa %xmm7, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: movdqa %xmm11, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm12
-; SSSE3-NEXT: pand %xmm12, %xmm7
-; SSSE3-NEXT: pandn %xmm10, %xmm12
-; SSSE3-NEXT: por %xmm7, %xmm12
-; SSSE3-NEXT: movdqa %xmm4, %xmm6
-; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: movdqa %xmm11, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm13
-; SSSE3-NEXT: pand %xmm13, %xmm4
-; SSSE3-NEXT: pandn %xmm10, %xmm13
-; SSSE3-NEXT: por %xmm4, %xmm13
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: movdqa %xmm11, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm14
-; SSSE3-NEXT: pand %xmm14, %xmm5
-; SSSE3-NEXT: pandn %xmm10, %xmm14
-; SSSE3-NEXT: por %xmm5, %xmm14
-; SSSE3-NEXT: movdqa %xmm2, %xmm4
-; SSSE3-NEXT: pxor %xmm8, %xmm4
-; SSSE3-NEXT: movdqa %xmm11, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm10, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm11, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pandn %xmm10, %xmm6
-; SSSE3-NEXT: por %xmm3, %xmm6
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pxor %xmm8, %xmm2
-; SSSE3-NEXT: movdqa %xmm11, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm0
-; SSSE3-NEXT: pandn %xmm10, %xmm3
-; SSSE3-NEXT: por %xmm0, %xmm3
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: movdqa %xmm11, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm10, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [18446744073709551488,18446744073709551488]
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm11 = [18446744071562067840,18446744071562067840]
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm2, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm10, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm8, %xmm0
-; SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm3
-; SSSE3-NEXT: pandn %xmm10, %xmm0
-; SSSE3-NEXT: por %xmm3, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm6, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: pandn %xmm10, %xmm2
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pandn %xmm10, %xmm3
-; SSSE3-NEXT: por %xmm5, %xmm3
-; SSSE3-NEXT: packssdw %xmm2, %xmm3
-; SSSE3-NEXT: packssdw %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm14, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm14
-; SSSE3-NEXT: pandn %xmm10, %xmm2
-; SSSE3-NEXT: por %xmm14, %xmm2
-; SSSE3-NEXT: movdqa %xmm13, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm13
-; SSSE3-NEXT: pandn %xmm10, %xmm3
-; SSSE3-NEXT: por %xmm13, %xmm3
-; SSSE3-NEXT: packssdw %xmm2, %xmm3
-; SSSE3-NEXT: movdqa %xmm12, %xmm1
-; SSSE3-NEXT: pxor %xmm8, %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm12
-; SSSE3-NEXT: pandn %xmm10, %xmm2
-; SSSE3-NEXT: por %xmm12, %xmm2
-; SSSE3-NEXT: pxor %xmm9, %xmm8
-; SSSE3-NEXT: movdqa %xmm8, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm11, %xmm8
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm8[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm9
-; SSSE3-NEXT: pandn %xmm10, %xmm1
-; SSSE3-NEXT: por %xmm9, %xmm1
-; SSSE3-NEXT: packssdw %xmm2, %xmm1
-; SSSE3-NEXT: packssdw %xmm1, %xmm3
-; SSSE3-NEXT: packsswb %xmm3, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v16i64_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm11 = [127,127]
-; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [2147483648,2147483648]
-; SSE41-NEXT: movdqa %xmm6, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm12 = [2147483775,2147483775]
-; SSE41-NEXT: movdqa %xmm12, %xmm10
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm10
-; SSE41-NEXT: movdqa %xmm12, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
-; SSE41-NEXT: pand %xmm10, %xmm0
-; SSE41-NEXT: por %xmm13, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm10
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm10
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm13
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm13
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm13
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: movdqa %xmm12, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm14
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm14
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm4
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm15
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm15
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm12, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm11, %xmm7
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm7
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm9, %xmm0
-; SSE41-NEXT: movdqa %xmm12, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm12
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm12, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm11
-; SSE41-NEXT: movapd {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
-; SSE41-NEXT: movapd %xmm11, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [18446744071562067840,18446744071562067840]
-; SSE41-NEXT: movapd %xmm1, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm11, %xmm4
-; SSE41-NEXT: movapd %xmm7, %xmm1
-; SSE41-NEXT: xorpd %xmm9, %xmm1
-; SSE41-NEXT: movapd %xmm1, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: packssdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm6, %xmm3
-; SSE41-NEXT: xorpd %xmm9, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE41-NEXT: movapd %xmm5, %xmm4
-; SSE41-NEXT: xorpd %xmm9, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm4
-; SSE41-NEXT: packssdw %xmm3, %xmm4
-; SSE41-NEXT: packssdw %xmm4, %xmm1
-; SSE41-NEXT: movapd %xmm15, %xmm3
-; SSE41-NEXT: xorpd %xmm9, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm4
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm15, %xmm3
-; SSE41-NEXT: movapd %xmm14, %xmm4
-; SSE41-NEXT: xorpd %xmm9, %xmm4
-; SSE41-NEXT: movapd %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm4
-; SSE41-NEXT: blendvpd %xmm0, %xmm14, %xmm4
-; SSE41-NEXT: packssdw %xmm3, %xmm4
-; SSE41-NEXT: movapd %xmm13, %xmm3
-; SSE41-NEXT: xorpd %xmm9, %xmm3
-; SSE41-NEXT: movapd %xmm3, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm2, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm13, %xmm3
-; SSE41-NEXT: xorpd %xmm10, %xmm9
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm8, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm9
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm9, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm10, %xmm2
-; SSE41-NEXT: packssdw %xmm3, %xmm2
-; SSE41-NEXT: packssdw %xmm2, %xmm4
-; SSE41-NEXT: packsswb %xmm4, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm8
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [127,127]
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm9
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm5, %xmm10
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm11
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm7, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm9, %xmm5, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm9, %xmm5, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm0
-; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm5, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm8, %xmm5, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm8, %xmm5, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm7, %xmm5, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm5, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm11, %xmm7
-; AVX1-NEXT: vblendvpd %xmm7, %xmm11, %xmm5, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm10, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm10, %xmm5, %xmm3
-; AVX1-NEXT: vpackssdw %xmm8, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackssdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm7, %xmm3, %xmm2
-; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [127,127,127,127]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm3, %ymm4, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1
-; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_ssat_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [127,127,127,127,127,127,127,127]
-; AVX512-NEXT: vpminsq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpminsq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX512-NEXT: vpmaxsq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpmaxsq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <16 x i64> %a0, <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
- %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127, i64 127>
- %3 = icmp sgt <16 x i64> %2, <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
- %4 = select <16 x i1> %3, <16 x i64> %2, <16 x i64> <i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128, i64 -128>
- %5 = trunc <16 x i64> %4 to <16 x i8>
- ret <16 x i8> %5
-}
-
-define <8 x i8> @trunc_ssat_v8i32_v8i8(<8 x i32> %a0) {
-; SSE-LABEL: trunc_ssat_v8i32_v8i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v8i32_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v8i32_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ssat_v8i32_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
-; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpminsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmaxsd {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
- %3 = icmp sgt <8 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
- %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
- %5 = trunc <8 x i32> %4 to <8 x i8>
- ret <8 x i8> %5
-}
-
-define void @trunc_ssat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
-; SSE-LABEL: trunc_ssat_v8i32_v8i8_store:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm0, %xmm0
-; SSE-NEXT: movq %xmm0, (%rdi)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v8i32_v8i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v8i32_v8i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ssat_v8i32_v8i8_store:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX512F-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_ssat_v8i32_v8i8_store:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsdb %ymm0, (%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ssat_v8i32_v8i8_store:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [127,127,127,127,127,127,127,127]
-; AVX512BW-NEXT: vpminsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168,4294967168]
-; AVX512BW-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_ssat_v8i32_v8i8_store:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsdb %ymm0, (%rdi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <8 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
- %3 = icmp sgt <8 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
- %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
- %5 = trunc <8 x i32> %4 to <8 x i8>
- store <8 x i8> %5, <8 x i8> *%p1
- ret void
-}
-
-define <16 x i8> @trunc_ssat_v16i32_v16i8(<16 x i32> %a0) {
-; SSE-LABEL: trunc_ssat_v16i32_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm2, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_ssat_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp slt <16 x i32> %a0, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
- %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
- %3 = icmp sgt <16 x i32> %2, <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
- %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> <i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128, i32 -128>
- %5 = trunc <16 x i32> %4 to <16 x i8>
- ret <16 x i8> %5
-}
-
-define <16 x i8> @trunc_ssat_v16i16_v16i8(<16 x i16> %a0) {
-; SSE-LABEL: trunc_ssat_v16i16_v16i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ssat_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_ssat_v16i16_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ssat_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpminsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpmaxsw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_ssat_v16i16_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovswb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <16 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %3 = icmp sgt <16 x i16> %2, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %5 = trunc <16 x i16> %4 to <16 x i8>
- ret <16 x i8> %5
-}
-
-define <32 x i8> @trunc_ssat_v32i16_v32i8(<32 x i16> %a0) {
-; SSE-LABEL: trunc_ssat_v32i16_v32i8:
-; SSE: # %bb.0:
-; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: packsswb %xmm3, %xmm2
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v32i16_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpacksswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_ssat_v32i16_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpacksswb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ssat_v32i16_v32i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512F-NEXT: vpminsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpminsw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_ssat_v32i16_v32i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
-; AVX512VL-NEXT: vpminsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpminsw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408,65408]
-; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmaxsw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_ssat_v32i16_v32i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovswb %zmm0, %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_ssat_v32i16_v32i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovswb %zmm0, %ymm0
-; AVX512BWVL-NEXT: retq
- %1 = icmp slt <32 x i16> %a0, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
- %3 = icmp sgt <32 x i16> %2, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
- %5 = trunc <32 x i16> %4 to <32 x i8>
- ret <32 x i8> %5
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-
-;
-; Unsigned saturation truncation to vXi32
-;
-
-define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
-; SSE2-LABEL: trunc_usat_v4i64_v4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm5
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm3
-; SSE2-NEXT: por %xmm1, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn {{.*}}(%rip), %xmm5
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v4i64_v4i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm2, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSSE3-NEXT: movdqa %xmm4, %xmm5
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm5
-; SSSE3-NEXT: pxor %xmm1, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm3
-; SSSE3-NEXT: por %xmm1, %xmm3
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pandn {{.*}}(%rip), %xmm5
-; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v4i64_v4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm2, %xmm3
-; SSE41-NEXT: pxor %xmm0, %xmm3
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm3
-; SSE41-NEXT: por %xmm6, %xmm3
-; SSE41-NEXT: pxor %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4294967295,4294967295]
-; SSE41-NEXT: movapd {{.*#+}} xmm5 = [4294967295,429496729]
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE41-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
-; SSE41-NEXT: movaps %xmm4, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v4i64_v4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [9223372041149743103,9223372041149743103]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpxor %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,429496729]
-; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm3, %xmm1
-; AVX1-NEXT: vmovapd {{.*#+}} xmm3 = [4294967295,4294967295]
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
-; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_usat_v4i64_v4i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,429496729]
-; AVX2-FAST-NEXT: vblendvpd %ymm1, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm1 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_usat_v4i64_v4i32:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_usat_v4i64_v4i32:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
-; AVX512VL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; AVX512VL-NEXT: vpmovqd %ymm1, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_usat_v4i64_v4i32:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX512BW-NEXT: vpcmpltuq %zmm1, %zmm0, %k1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_usat_v4i64_v4i32:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpcmpltuq {{.*}}(%rip){1to4}, %ymm0, %k1
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,429496729]
-; AVX512BWVL-NEXT: vmovdqa64 %ymm0, %ymm1 {%k1}
-; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp ult <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %2 = select <4 x i1> %1, <4 x i64> %a0, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 429496729>
- %3 = trunc <4 x i64> %2 to <4 x i32>
- ret <4 x i32> %3
-}
-
-define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_usat_v8i64_v8i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm3, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; SSE2-NEXT: movdqa %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm0, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
-; SSE2-NEXT: movaps %xmm3, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v8i64_v8i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [4294967295,4294967295]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm3, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259455,9223372039002259455]
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm5, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
-; SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pxor %xmm0, %xmm5
-; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm2, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2]
-; SSSE3-NEXT: movaps %xmm3, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v8i64_v8i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm9 = [4294967295,4294967295]
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259455,9223372039002259455]
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm6
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm6, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm6
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm4, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm7, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm3
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3
-; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: pxor %xmm8, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm5, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm5, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm9
-; SSE41-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm2[0,2]
-; SSE41-NEXT: movaps %xmm9, %xmm0
-; SSE41-NEXT: movaps %xmm3, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v8i64_v8i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372041149743103,9223372041149743103]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [4294967295,4294967295]
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_usat_v8i64_v8i32:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
-; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_usat_v8i64_v8i32:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-NEXT: vpxor %ymm3, %ymm1, %ymm4
-; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103]
-; AVX2-FAST-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-FAST-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vpxor %ymm3, %ymm0, %ymm3
-; AVX2-FAST-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-FAST-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc_usat_v8i64_v8i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovusqd %zmm0, %ymm0
-; AVX512-NEXT: retq
- %1 = icmp ult <8 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
- %3 = trunc <8 x i64> %2 to <8 x i32>
- ret <8 x i32> %3
-}
-
-;
-; Unsigned saturation truncation to vXi16
-;
-
-define <8 x i16> @trunc_usat_v8i64_v8i16(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_usat_v8i64_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm2, %xmm5
-; SSE2-NEXT: pxor %xmm6, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm4, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm6, %xmm2
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm3
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm7, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm6
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm6, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v8i64_v8i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm2, %xmm5
-; SSSE3-NEXT: pxor %xmm6, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002324991,9223372039002324991]
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm4, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm6, %xmm2
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm6, %xmm3
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm7, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm1, %xmm6
-; SSSE3-NEXT: movdqa %xmm9, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v8i64_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm9 = [65535,65535]
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002324991,9223372039002324991]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE41-NEXT: packusdw %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm2, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE41-NEXT: packusdw %xmm5, %xmm9
-; SSE41-NEXT: packusdw %xmm9, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v8i64_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854841343,9223372036854841343]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [65535,65535]
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v8i64_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841343,9223372036854841343,9223372036854841343,9223372036854841343]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_usat_v8i64_v8i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovusqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp ult <8 x i64> %a0, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
- %3 = trunc <8 x i64> %2 to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <8 x i16> @trunc_usat_v8i32_v8i16(<8 x i32> %a0) {
-; SSE2-LABEL: trunc_usat_v8i32_v8i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm2, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm4, %xmm5
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm3, %xmm5
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm4
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pslld $16, %xmm4
-; SSE2-NEXT: psrad $16, %xmm4
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm4, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v8i32_v8i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm6
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm1, %xmm5
-; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v8i32_v8i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE41-NEXT: pminud %xmm2, %xmm1
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v8i32_v8i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vpminud %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminud %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v8i32_v8i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_usat_v8i32_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_usat_v8i32_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovusdw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_usat_v8i32_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovusdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp ult <8 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %3 = trunc <8 x i32> %2 to <8 x i16>
- ret <8 x i16> %3
-}
-
-define <16 x i16> @trunc_usat_v16i32_v16i16(<16 x i32> %a0) {
-; SSE2-LABEL: trunc_usat_v16i32_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm1, %xmm8
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm2, %xmm7
-; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm1
-; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pxor %xmm7, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm6, %xmm4
-; SSE2-NEXT: movdqa %xmm5, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm7, %xmm2
-; SSE2-NEXT: por %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm3
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm7, %xmm4
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm8, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pxor %xmm5, %xmm7
-; SSE2-NEXT: pand %xmm8, %xmm5
-; SSE2-NEXT: por %xmm7, %xmm5
-; SSE2-NEXT: pslld $16, %xmm5
-; SSE2-NEXT: psrad $16, %xmm5
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm5, %xmm0
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: packssdw %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v16i32_v16i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm1, %xmm8
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm2, %xmm7
-; SSSE3-NEXT: pxor %xmm6, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147549183,2147549183,2147549183,2147549183]
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm1
-; SSSE3-NEXT: pcmpeqd %xmm7, %xmm7
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pxor %xmm7, %xmm1
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm6, %xmm4
-; SSSE3-NEXT: movdqa %xmm5, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm7, %xmm2
-; SSSE3-NEXT: por %xmm3, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm6, %xmm3
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm7, %xmm4
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm8, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSSE3-NEXT: pand %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm7, %xmm5
-; SSSE3-NEXT: pslld $16, %xmm5
-; SSSE3-NEXT: psrad $16, %xmm5
-; SSSE3-NEXT: pslld $16, %xmm0
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm5, %xmm0
-; SSSE3-NEXT: pslld $16, %xmm2
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: pslld $16, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: packssdw %xmm2, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v16i32_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535]
-; SSE41-NEXT: pminud %xmm4, %xmm3
-; SSE41-NEXT: pminud %xmm4, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pminud %xmm4, %xmm1
-; SSE41-NEXT: pminud %xmm4, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v16i32_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [65535,65535,65535,65535]
-; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v16i32_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_usat_v16i32_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovusdw %zmm0, %ymm0
-; AVX512-NEXT: retq
- %1 = icmp ult <16 x i32> %a0, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
- %3 = trunc <16 x i32> %2 to <16 x i16>
- ret <16 x i16> %3
-}
-
-;
-; Unsigned saturation truncation to v16i8
-;
-
-define <8 x i8> @trunc_usat_v8i64_v8i8(<8 x i64> %a0) {
-; SSE2-LABEL: trunc_usat_v8i64_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: pxor %xmm6, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm9, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm1, %xmm5
-; SSE2-NEXT: movdqa %xmm4, %xmm0
-; SSE2-NEXT: pxor %xmm6, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm5, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm6
-; SSE2-NEXT: movdqa %xmm9, %xmm1
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm2, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v8i64_v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pxor %xmm6, %xmm0
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT: movdqa %xmm9, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: movdqa %xmm4, %xmm0
-; SSSE3-NEXT: pxor %xmm6, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm4, %xmm0
-; SSSE3-NEXT: packuswb %xmm5, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm6, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm6
-; SSSE3-NEXT: movdqa %xmm9, %xmm1
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm2, %xmm1
-; SSSE3-NEXT: packuswb %xmm4, %xmm1
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v8i64_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE41-NEXT: packusdw %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm2, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE41-NEXT: packusdw %xmm5, %xmm9
-; SSE41-NEXT: packusdw %xmm9, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm1
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v8i64_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255]
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v8i64_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_usat_v8i64_v8i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %3 = trunc <8 x i64> %2 to <8 x i8>
- ret <8 x i8> %3
-}
-
-define void @trunc_usat_v8i64_v8i8_store(<8 x i64> %a0, <8 x i8> *%p1) {
-; SSE2-LABEL: trunc_usat_v8i64_v8i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm5, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm9, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm6
-; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm10, %xmm7
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
-; SSE2-NEXT: por %xmm7, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm4, %xmm1
-; SSE2-NEXT: movdqa %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm5, %xmm0
-; SSE2-NEXT: movdqa %xmm9, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: pand %xmm6, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm5
-; SSE2-NEXT: movdqa %xmm9, %xmm0
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm9, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm0
-; SSE2-NEXT: pand %xmm0, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm1
-; SSE2-NEXT: packuswb %xmm0, %xmm1
-; SSE2-NEXT: movq %xmm1, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v8i64_v8i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: pxor %xmm5, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm5, %xmm1
-; SSSE3-NEXT: movdqa %xmm9, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm6
-; SSSE3-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm10, %xmm7
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3]
-; SSSE3-NEXT: por %xmm7, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm0, %xmm1
-; SSSE3-NEXT: packuswb %xmm4, %xmm1
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm5, %xmm0
-; SSSE3-NEXT: movdqa %xmm9, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm0, %xmm4
-; SSSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; SSSE3-NEXT: por %xmm0, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm5
-; SSSE3-NEXT: movdqa %xmm9, %xmm0
-; SSSE3-NEXT: pcmpgtd %xmm5, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm9, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm0
-; SSSE3-NEXT: pand %xmm0, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm0
-; SSSE3-NEXT: por %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm4, %xmm0
-; SSSE3-NEXT: packuswb %xmm0, %xmm1
-; SSSE3-NEXT: packuswb %xmm0, %xmm1
-; SSSE3-NEXT: movq %xmm1, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v8i64_v8i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE41-NEXT: packusdw %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm7, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm5
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm5
-; SSE41-NEXT: movdqa %xmm4, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm6
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2]
-; SSE41-NEXT: pand %xmm5, %xmm0
-; SSE41-NEXT: por %xmm6, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm5
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm5
-; SSE41-NEXT: pxor %xmm2, %xmm7
-; SSE41-NEXT: movdqa %xmm4, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm4, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm9
-; SSE41-NEXT: packusdw %xmm5, %xmm9
-; SSE41-NEXT: packusdw %xmm9, %xmm1
-; SSE41-NEXT: packuswb %xmm0, %xmm1
-; SSE41-NEXT: movq %xmm1, (%rdi)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v8i64_v8i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [9223372036854776063,9223372036854776063]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpxor %xmm2, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm6
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vmovapd {{.*#+}} xmm4 = [255,255]
-; AVX1-NEXT: vblendvpd %xmm2, %xmm3, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm2
-; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v8i64_v8i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [255,255,255,255]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_usat_v8i64_v8i8_store:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovusqb %zmm0, (%rdi)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp ult <8 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %2 = select <8 x i1> %1, <8 x i64> %a0, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %3 = trunc <8 x i64> %2 to <8 x i8>
- store <8 x i8> %3, <8 x i8> *%p1
- ret void
-}
-
-define <16 x i8> @trunc_usat_v16i64_v16i8(<16 x i64> %a0) {
-; SSE2-LABEL: trunc_usat_v16i64_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
-; SSE2-NEXT: movdqa %xmm1, %xmm11
-; SSE2-NEXT: pxor %xmm9, %xmm11
-; SSE2-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711]
-; SSE2-NEXT: movdqa %xmm10, %xmm12
-; SSE2-NEXT: pcmpgtd %xmm11, %xmm12
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
-; SSE2-NEXT: por %xmm11, %xmm12
-; SSE2-NEXT: pand %xmm12, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm12
-; SSE2-NEXT: por %xmm1, %xmm12
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm13, %xmm14
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm14, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm12, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm13
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm13, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm2, %xmm3
-; SSE2-NEXT: pxor %xmm9, %xmm3
-; SSE2-NEXT: movdqa %xmm10, %xmm11
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm11
-; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
-; SSE2-NEXT: pand %xmm12, %xmm13
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3]
-; SSE2-NEXT: por %xmm13, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: packuswb %xmm1, %xmm3
-; SSE2-NEXT: packuswb %xmm3, %xmm0
-; SSE2-NEXT: movdqa %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm3, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm1, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm5
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm5, %xmm2
-; SSE2-NEXT: movdqa %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm9, %xmm1
-; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSE2-NEXT: pand %xmm11, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm1
-; SSE2-NEXT: pand %xmm1, %xmm4
-; SSE2-NEXT: pandn %xmm8, %xmm1
-; SSE2-NEXT: por %xmm4, %xmm1
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: movdqa %xmm7, %xmm2
-; SSE2-NEXT: pxor %xmm9, %xmm2
-; SSE2-NEXT: movdqa %xmm10, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSE2-NEXT: por %xmm2, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm7
-; SSE2-NEXT: pandn %xmm8, %xmm3
-; SSE2-NEXT: por %xmm7, %xmm3
-; SSE2-NEXT: pxor %xmm6, %xmm9
-; SSE2-NEXT: movdqa %xmm10, %xmm2
-; SSE2-NEXT: pcmpgtd %xmm9, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSE2-NEXT: pcmpeqd %xmm10, %xmm9
-; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSE2-NEXT: pand %xmm4, %xmm5
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE2-NEXT: por %xmm5, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm6
-; SSE2-NEXT: pandn %xmm8, %xmm2
-; SSE2-NEXT: por %xmm6, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm1
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v16i64_v16i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm9 = [9223372039002259456,9223372039002259456]
-; SSSE3-NEXT: movdqa %xmm1, %xmm11
-; SSSE3-NEXT: pxor %xmm9, %xmm11
-; SSSE3-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711]
-; SSSE3-NEXT: movdqa %xmm10, %xmm12
-; SSSE3-NEXT: pcmpgtd %xmm11, %xmm12
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm12[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
-; SSSE3-NEXT: por %xmm11, %xmm12
-; SSSE3-NEXT: pand %xmm12, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm12
-; SSSE3-NEXT: por %xmm1, %xmm12
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm10, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm14 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm13, %xmm14
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm14, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm1, %xmm0
-; SSSE3-NEXT: packuswb %xmm12, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm10, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm13
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm13, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm3, %xmm1
-; SSSE3-NEXT: movdqa %xmm2, %xmm3
-; SSSE3-NEXT: pxor %xmm9, %xmm3
-; SSSE3-NEXT: movdqa %xmm10, %xmm11
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm11
-; SSSE3-NEXT: pshufd {{.*#+}} xmm12 = xmm11[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm13 = xmm3[1,1,3,3]
-; SSSE3-NEXT: pand %xmm12, %xmm13
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,3,3]
-; SSSE3-NEXT: por %xmm13, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: packuswb %xmm1, %xmm3
-; SSSE3-NEXT: packuswb %xmm3, %xmm0
-; SSSE3-NEXT: movdqa %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm10, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm3, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm1, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm5
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm5, %xmm2
-; SSSE3-NEXT: movdqa %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm9, %xmm1
-; SSSE3-NEXT: movdqa %xmm10, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
-; SSSE3-NEXT: pand %xmm11, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm1
-; SSSE3-NEXT: pand %xmm1, %xmm4
-; SSSE3-NEXT: pandn %xmm8, %xmm1
-; SSSE3-NEXT: por %xmm4, %xmm1
-; SSSE3-NEXT: packuswb %xmm2, %xmm1
-; SSSE3-NEXT: movdqa %xmm7, %xmm2
-; SSSE3-NEXT: pxor %xmm9, %xmm2
-; SSSE3-NEXT: movdqa %xmm10, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm2, %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; SSSE3-NEXT: por %xmm2, %xmm3
-; SSSE3-NEXT: pand %xmm3, %xmm7
-; SSSE3-NEXT: pandn %xmm8, %xmm3
-; SSSE3-NEXT: por %xmm7, %xmm3
-; SSSE3-NEXT: pxor %xmm6, %xmm9
-; SSSE3-NEXT: movdqa %xmm10, %xmm2
-; SSSE3-NEXT: pcmpgtd %xmm9, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
-; SSSE3-NEXT: pcmpeqd %xmm10, %xmm9
-; SSSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm9[1,1,3,3]
-; SSSE3-NEXT: pand %xmm4, %xmm5
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSSE3-NEXT: por %xmm5, %xmm2
-; SSSE3-NEXT: pand %xmm2, %xmm6
-; SSSE3-NEXT: pandn %xmm8, %xmm2
-; SSSE3-NEXT: por %xmm6, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm1
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v16i64_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: movapd {{.*#+}} xmm9 = [255,255]
-; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [9223372039002259456,9223372039002259456]
-; SSE41-NEXT: movdqa %xmm1, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [9223372039002259711,9223372039002259711]
-; SSE41-NEXT: movdqa %xmm10, %xmm12
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm12
-; SSE41-NEXT: movdqa %xmm10, %xmm13
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm13
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,0,2,2]
-; SSE41-NEXT: pand %xmm12, %xmm0
-; SSE41-NEXT: por %xmm13, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm12
-; SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm12
-; SSE41-NEXT: movdqa %xmm8, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm13
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm13
-; SSE41-NEXT: movdqa %xmm10, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm13, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm13
-; SSE41-NEXT: blendvpd %xmm0, %xmm8, %xmm13
-; SSE41-NEXT: packusdw %xmm12, %xmm13
-; SSE41-NEXT: movdqa %xmm3, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm8
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm8
-; SSE41-NEXT: movdqa %xmm10, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm8, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm8
-; SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm8
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm3
-; SSE41-NEXT: movdqa %xmm10, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm1, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: packusdw %xmm8, %xmm1
-; SSE41-NEXT: packusdw %xmm1, %xmm13
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm10, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm2
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm2
-; SSE41-NEXT: movdqa %xmm10, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm2
-; SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE41-NEXT: packusdw %xmm1, %xmm2
-; SSE41-NEXT: movdqa %xmm7, %xmm0
-; SSE41-NEXT: pxor %xmm11, %xmm0
-; SSE41-NEXT: movdqa %xmm10, %xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
-; SSE41-NEXT: movdqa %xmm10, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2]
-; SSE41-NEXT: pand %xmm1, %xmm0
-; SSE41-NEXT: por %xmm3, %xmm0
-; SSE41-NEXT: movapd %xmm9, %xmm1
-; SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1
-; SSE41-NEXT: pxor %xmm6, %xmm11
-; SSE41-NEXT: movdqa %xmm10, %xmm3
-; SSE41-NEXT: pcmpeqd %xmm11, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm11, %xmm10
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm10[0,0,2,2]
-; SSE41-NEXT: pand %xmm3, %xmm0
-; SSE41-NEXT: por %xmm10, %xmm0
-; SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm9
-; SSE41-NEXT: packusdw %xmm1, %xmm9
-; SSE41-NEXT: packusdw %xmm9, %xmm2
-; SSE41-NEXT: packuswb %xmm2, %xmm13
-; SSE41-NEXT: movdqa %xmm13, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v16i64_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa %ymm0, %ymm8
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vpxor %xmm5, %xmm8, %xmm4
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372036854776063,9223372036854776063]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm11
-; AVX1-NEXT: vpxor %xmm5, %xmm11, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-NEXT: vpxor %xmm5, %xmm1, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm10
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm14
-; AVX1-NEXT: vpxor %xmm5, %xmm14, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm12
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm13
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT: vpxor %xmm5, %xmm7, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm15
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
-; AVX1-NEXT: vpxor %xmm5, %xmm0, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [255,255]
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm6, %xmm9
-; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3
-; AVX1-NEXT: vblendvpd %xmm15, %xmm7, %xmm6, %xmm4
-; AVX1-NEXT: vblendvpd %xmm13, %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vblendvpd %xmm12, %xmm14, %xmm6, %xmm5
-; AVX1-NEXT: vblendvpd %xmm10, %xmm1, %xmm6, %xmm1
-; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vblendvpd %xmm0, %xmm11, %xmm6, %xmm7
-; AVX1-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX1-NEXT: vblendvpd %xmm0, %xmm8, %xmm6, %xmm6
-; AVX1-NEXT: vpackusdw %xmm9, %xmm3, %xmm0
-; AVX1-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpackusdw %xmm5, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm7, %xmm6, %xmm2
-; AVX1-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v16i64_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [255,255,255,255]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm5, %ymm1, %ymm6
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm7 = [9223372036854776063,9223372036854776063,9223372036854776063,9223372036854776063]
-; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6
-; AVX2-NEXT: vblendvpd %ymm6, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm5, %ymm0, %ymm6
-; AVX2-NEXT: vpcmpgtq %ymm6, %ymm7, %ymm6
-; AVX2-NEXT: vblendvpd %ymm6, %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm7, %ymm1
-; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm7, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_usat_v16i64_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255]
-; AVX512-NEXT: vpminuq %zmm2, %zmm1, %zmm1
-; AVX512-NEXT: vpminuq %zmm2, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp ult <16 x i64> %a0, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %2 = select <16 x i1> %1, <16 x i64> %a0, <16 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
- %3 = trunc <16 x i64> %2 to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <8 x i8> @trunc_usat_v8i32_v8i8(<8 x i32> %a0) {
-; SSE2-LABEL: trunc_usat_v8i32_v8i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm5
-; SSE2-NEXT: pxor %xmm4, %xmm5
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm5, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pandn %xmm3, %xmm6
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm2
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm2
-; SSE2-NEXT: por %xmm0, %xmm2
-; SSE2-NEXT: packuswb %xmm6, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm2, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v8i32_v8i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm6
-; SSSE3-NEXT: por %xmm6, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm1, %xmm5
-; SSSE3-NEXT: pshufb %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v8i32_v8i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: pminud %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v8i32_v8i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v8i32_v8i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_usat_v8i32_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_usat_v8i32_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_usat_v8i32_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %3 = trunc <8 x i32> %2 to <8 x i8>
- ret <8 x i8> %3
-}
-
-define void @trunc_usat_v8i32_v8i8_store(<8 x i32> %a0, <8 x i8> *%p1) {
-; SSE2-LABEL: trunc_usat_v8i32_v8i8_store:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm5, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm4, %xmm6
-; SSE2-NEXT: pand %xmm6, %xmm1
-; SSE2-NEXT: pandn %xmm2, %xmm6
-; SSE2-NEXT: por %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm0, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm3, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm0
-; SSE2-NEXT: pandn %xmm2, %xmm5
-; SSE2-NEXT: por %xmm0, %xmm5
-; SSE2-NEXT: packuswb %xmm6, %xmm5
-; SSE2-NEXT: packuswb %xmm0, %xmm5
-; SSE2-NEXT: movq %xmm5, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v8i32_v8i8_store:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm0, %xmm4
-; SSSE3-NEXT: pxor %xmm3, %xmm4
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm5, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm4, %xmm6
-; SSSE3-NEXT: pand %xmm6, %xmm0
-; SSSE3-NEXT: pandn %xmm2, %xmm6
-; SSSE3-NEXT: por %xmm0, %xmm6
-; SSSE3-NEXT: pxor %xmm1, %xmm3
-; SSSE3-NEXT: pcmpgtd %xmm3, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm1
-; SSSE3-NEXT: pandn %xmm2, %xmm5
-; SSSE3-NEXT: por %xmm1, %xmm5
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm0, %xmm5
-; SSSE3-NEXT: pshufb %xmm0, %xmm6
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
-; SSSE3-NEXT: movq %xmm6, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v8i32_v8i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255]
-; SSE41-NEXT: pminud %xmm2, %xmm0
-; SSE41-NEXT: pminud %xmm2, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT: movq %xmm0, (%rdi)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v8i32_v8i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [255,255,255,255]
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX1-NEXT: vmovq %xmm0, (%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v8i32_v8i8_store:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_usat_v8i32_v8i8_store:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_usat_v8i32_v8i8_store:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovusdb %ymm0, (%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_usat_v8i32_v8i8_store:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpbroadcastd {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255]
-; AVX512BW-NEXT: vpminud %ymm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, (%rdi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_usat_v8i32_v8i8_store:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovusdb %ymm0, (%rdi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp ult <8 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %2 = select <8 x i1> %1, <8 x i32> %a0, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %3 = trunc <8 x i32> %2 to <8 x i8>
- store <8 x i8> %3, <8 x i8> *%p1
- ret void
-}
-
-define <16 x i8> @trunc_usat_v16i32_v16i8(<16 x i32> %a0) {
-; SSE2-LABEL: trunc_usat_v16i32_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
-; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT: movdqa %xmm1, %xmm7
-; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm1, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm7
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm7
-; SSE2-NEXT: pand %xmm7, %xmm0
-; SSE2-NEXT: pandn %xmm8, %xmm7
-; SSE2-NEXT: por %xmm7, %xmm0
-; SSE2-NEXT: packuswb %xmm4, %xmm0
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm6, %xmm1
-; SSE2-NEXT: movdqa %xmm5, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm1, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pandn %xmm8, %xmm4
-; SSE2-NEXT: por %xmm3, %xmm4
-; SSE2-NEXT: pxor %xmm2, %xmm6
-; SSE2-NEXT: pcmpgtd %xmm6, %xmm5
-; SSE2-NEXT: pand %xmm5, %xmm2
-; SSE2-NEXT: pandn %xmm8, %xmm5
-; SSE2-NEXT: por %xmm2, %xmm5
-; SSE2-NEXT: packuswb %xmm4, %xmm5
-; SSE2-NEXT: packuswb %xmm5, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v16i32_v16i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT: movdqa %xmm1, %xmm7
-; SSSE3-NEXT: pxor %xmm6, %xmm7
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [2147483903,2147483903,2147483903,2147483903]
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm7, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm1, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm6, %xmm1
-; SSSE3-NEXT: movdqa %xmm5, %xmm7
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm7
-; SSSE3-NEXT: pand %xmm7, %xmm0
-; SSSE3-NEXT: pandn %xmm8, %xmm7
-; SSSE3-NEXT: por %xmm7, %xmm0
-; SSSE3-NEXT: packuswb %xmm4, %xmm0
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm6, %xmm1
-; SSSE3-NEXT: movdqa %xmm5, %xmm4
-; SSSE3-NEXT: pcmpgtd %xmm1, %xmm4
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pandn %xmm8, %xmm4
-; SSSE3-NEXT: por %xmm3, %xmm4
-; SSSE3-NEXT: pxor %xmm2, %xmm6
-; SSSE3-NEXT: pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT: pand %xmm5, %xmm2
-; SSSE3-NEXT: pandn %xmm8, %xmm5
-; SSSE3-NEXT: por %xmm2, %xmm5
-; SSSE3-NEXT: packuswb %xmm4, %xmm5
-; SSSE3-NEXT: packuswb %xmm5, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v16i32_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255]
-; SSE41-NEXT: pminud %xmm4, %xmm1
-; SSE41-NEXT: pminud %xmm4, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: pminud %xmm4, %xmm3
-; SSE41-NEXT: pminud %xmm4, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v16i32_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255]
-; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpminud %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v16i32_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpminud %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpminud %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc_usat_v16i32_v16i8:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovusdb %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
- %1 = icmp ult <16 x i32> %a0, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %2 = select <16 x i1> %1, <16 x i32> %a0, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
- %3 = trunc <16 x i32> %2 to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <16 x i8> @trunc_usat_v16i16_v16i8(<16 x i16> %a0) {
-; SSE2-LABEL: trunc_usat_v16i16_v16i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSE2-NEXT: pminsw %xmm3, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: pminsw %xmm3, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v16i16_v16i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSSE3-NEXT: pminsw %xmm3, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: pminsw %xmm3, %xmm0
-; SSSE3-NEXT: pxor %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v16i16_v16i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pminuw %xmm2, %xmm1
-; SSE41-NEXT: pminuw %xmm2, %xmm0
-; SSE41-NEXT: packuswb %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v16i16_v16i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpminuw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpminuw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v16i16_v16i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_usat_v16i16_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_usat_v16i16_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_usat_v16i16_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpminuw {{.*}}(%rip), %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_usat_v16i16_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovuswb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = icmp ult <16 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %2 = select <16 x i1> %1, <16 x i16> %a0, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %3 = trunc <16 x i16> %2 to <16 x i8>
- ret <16 x i8> %3
-}
-
-define <32 x i8> @trunc_usat_v32i16_v32i8(<32 x i16> %a0) {
-; SSE2-LABEL: trunc_usat_v32i16_v32i8:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSE2-NEXT: pminsw %xmm5, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: pminsw %xmm5, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pminsw %xmm5, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: pminsw %xmm5, %xmm0
-; SSE2-NEXT: pxor %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: movdqa %xmm2, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc_usat_v32i16_v32i8:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: movdqa {{.*#+}} xmm5 = [33023,33023,33023,33023,33023,33023,33023,33023]
-; SSSE3-NEXT: pminsw %xmm5, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: pminsw %xmm5, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pminsw %xmm5, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: pminsw %xmm5, %xmm0
-; SSSE3-NEXT: pxor %xmm4, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_usat_v32i16_v32i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pminuw %xmm4, %xmm3
-; SSE41-NEXT: pminuw %xmm4, %xmm2
-; SSE41-NEXT: packuswb %xmm3, %xmm2
-; SSE41-NEXT: pminuw %xmm4, %xmm1
-; SSE41-NEXT: pminuw %xmm4, %xmm0
-; SSE41-NEXT: packuswb %xmm1, %xmm0
-; SSE41-NEXT: movdqa %xmm2, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_usat_v32i16_v32i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpminuw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpminuw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpminuw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc_usat_v32i16_v32i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpminuw %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpminuw %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc_usat_v32i16_v32i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512F-NEXT: vpminuw %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpminuw %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_usat_v32i16_v32i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpminuw %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpminuw %ymm2, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc_usat_v32i16_v32i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovuswb %zmm0, %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc_usat_v32i16_v32i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovuswb %zmm0, %ymm0
-; AVX512BWVL-NEXT: retq
- %1 = icmp ult <32 x i16> %a0, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %2 = select <32 x i1> %1, <32 x i16> %a0, <32 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
- %3 = trunc <32 x i16> %2 to <32 x i8>
- ret <32 x i8> %3
-}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
-
-define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) {
-; SSE-LABEL: trunc8i64_8i32:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc8i64_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc8i64_8i32:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc8i64_8i32:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc8i64_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <8 x i64> %a to <8 x i32>
- ret <8 x i32> %0
-}
-
-define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) {
-; SSE-LABEL: trunc8i64_8i32_ashr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc8i64_8i32_ashr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc8i64_8i32_ashr:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [1,3,5,7,5,7,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc8i64_8i32_ashr:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsraq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: retq
-entry:
- %0 = ashr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
- %1 = trunc <8 x i64> %0 to <8 x i32>
- ret <8 x i32> %1
-}
-
-define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) {
-; SSE-LABEL: trunc8i64_8i32_lshr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
-; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc8i64_8i32_lshr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc8i64_8i32_lshr:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vpsrlq $32, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpsrlq $32, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc8i64_8i32_lshr:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm0
-; AVX512-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512-NEXT: retq
-entry:
- %0 = lshr <8 x i64> %a, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
- %1 = trunc <8 x i64> %0 to <8 x i32>
- ret <8 x i32> %1
-}
-
-define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
-; SSE2-LABEL: trunc8i64_8i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i64_8i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i64_8i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packusdw %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc8i64_8i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc8i64_8i16:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc8i64_8i16:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-FAST-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512-LABEL: trunc8i64_8i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <8 x i64> %a to <8 x i16>
- ret <8 x i16> %0
-}
-
-define void @trunc8i64_8i8(<8 x i64> %a) {
-; SSE2-LABEL: trunc8i64_8i8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i64_8i8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSSE3-NEXT: movq %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i64_8i8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm1
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packusdw %xmm2, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm0
-; SSE41-NEXT: movq %xmm0, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc8i64_8i8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovq %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc8i64_8i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc8i64_8i8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovqb %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <8 x i64> %a to <8 x i8>
- store <8 x i8> %0, <8 x i8>* undef, align 4
- ret void
-}
-
-define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
-; SSE2-LABEL: trunc8i32_8i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i32_8i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i32_8i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc8i32_8i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc8i32_8i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc8i32_8i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc8i32_8i16:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc8i32_8i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc8i32_8i16:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = trunc <8 x i32> %a to <8 x i16>
- ret <8 x i16> %0
-}
-
-define <8 x i16> @trunc8i32_8i16_ashr(<8 x i32> %a) {
-; SSE-LABEL: trunc8i32_8i16_ashr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc8i32_8i16_ashr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc8i32_8i16_ashr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc8i32_8i16_ashr:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc8i32_8i16_ashr:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc8i32_8i16_ashr:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc8i32_8i16_ashr:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = ashr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
- %1 = trunc <8 x i32> %0 to <8 x i16>
- ret <8 x i16> %1
-}
-
-define <8 x i16> @trunc8i32_8i16_lshr(<8 x i32> %a) {
-; SSE2-LABEL: trunc8i32_8i16_lshr:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i32_8i16_lshr:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,10,11,14,15,14,15,255,255]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i32_8i16_lshr:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc8i32_8i16_lshr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc8i32_8i16_lshr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc8i32_8i16_lshr:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc8i32_8i16_lshr:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc8i32_8i16_lshr:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc8i32_8i16_lshr:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = lshr <8 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
- %1 = trunc <8 x i32> %0 to <8 x i16>
- ret <8 x i16> %1
-}
-
-define void @trunc8i32_8i8(<8 x i32> %a) {
-; SSE2-LABEL: trunc8i32_8i8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i32_8i8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movq %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i32_8i8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT: movq %xmm0, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc8i32_8i8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT: vmovq %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc8i32_8i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX2-NEXT: vmovq %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc8i32_8i8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, (%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc8i32_8i8:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovdb %ymm0, (%rax)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc8i32_8i8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512BW-NEXT: vmovq %xmm0, (%rax)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc8i32_8i8:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovdb %ymm0, (%rax)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = trunc <8 x i32> %a to <8 x i8>
- store <8 x i8> %0, <8 x i8>* undef, align 4
- ret void
-}
-
-define void @trunc16i32_16i16(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pslld $16, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: pslld $16, %xmm0
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: pslld $16, %xmm3
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: pslld $16, %xmm2
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSSE3-NEXT: movdqu %xmm2, (%rax)
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm4, %xmm4
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3],xmm3[4],xmm4[5],xmm3[6],xmm4[7]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: movdqu %xmm2, (%rax)
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc16i32_16i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqu %xmm1, (%rax)
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i32_16i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqu %xmm1, (%rax)
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc16i32_16i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovdw %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <16 x i32> %a to <16 x i16>
- store <16 x i16> %0, <16 x i16>* undef, align 4
- ret void
-}
-
-define void @trunc16i32_16i16_ashr(<16 x i32> %a) {
-; SSE-LABEL: trunc16i32_16i16_ashr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: psrad $16, %xmm3
-; SSE-NEXT: psrad $16, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: movdqu %xmm2, (%rax)
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc16i32_16i16_ashr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm1, (%rax)
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i32_16i16_ashr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
-; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc16i32_16i16_ashr:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdw %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %0 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
- %1 = trunc <16 x i32> %0 to <16 x i16>
- store <16 x i16> %1, <16 x i16>* undef, align 4
- ret void
-}
-
-define void @trunc16i32_16i16_lshr(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i16_lshr:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i16_lshr:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: psrad $16, %xmm1
-; SSSE3-NEXT: psrad $16, %xmm0
-; SSSE3-NEXT: packssdw %xmm1, %xmm0
-; SSSE3-NEXT: psrad $16, %xmm3
-; SSSE3-NEXT: psrad $16, %xmm2
-; SSSE3-NEXT: packssdw %xmm3, %xmm2
-; SSSE3-NEXT: movdqu %xmm2, (%rax)
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i16_lshr:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrld $16, %xmm3
-; SSE41-NEXT: psrld $16, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: movdqu %xmm2, (%rax)
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc16i32_16i16_lshr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm1, (%rax)
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i32_16i16_lshr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc16i32_16i16_lshr:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdw %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %0 = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
- %1 = trunc <16 x i32> %0 to <16 x i16>
- store <16 x i16> %1, <16 x i16>* undef, align 4
- ret void
-}
-
-define void @trunc16i32_16i8(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: pand %xmm4, %xmm1
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc16i32_16i8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i32_16i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc16i32_16i8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <16 x i32> %a to <16 x i8>
- store <16 x i8> %0, <16 x i8>* undef, align 4
- ret void
-}
-
-define void @trunc16i32_16i8_ashr(<16 x i32> %a) {
-; SSE-LABEL: trunc16i32_16i8_ashr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: psrad $24, %xmm1
-; SSE-NEXT: psrad $24, %xmm0
-; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: psrad $24, %xmm3
-; SSE-NEXT: psrad $24, %xmm2
-; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: packsswb %xmm2, %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc16i32_16i8_ashr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i32_16i8_ashr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrad $24, %ymm1, %ymm1
-; AVX2-NEXT: vpsrad $24, %ymm0, %ymm0
-; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc16i32_16i8_ashr:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %0 = ashr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
- %1 = trunc <16 x i32> %0 to <16 x i8>
- store <16 x i8> %1, <16 x i8>* undef, align 4
- ret void
-}
-
-define void @trunc16i32_16i8_lshr(<16 x i32> %a) {
-; SSE2-LABEL: trunc16i32_16i8_lshr:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrld $24, %xmm1
-; SSE2-NEXT: psrld $24, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: psrld $24, %xmm3
-; SSE2-NEXT: psrld $24, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: packuswb %xmm2, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i32_16i8_lshr:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: psrld $24, %xmm1
-; SSSE3-NEXT: psrld $24, %xmm0
-; SSSE3-NEXT: packuswb %xmm1, %xmm0
-; SSSE3-NEXT: psrld $24, %xmm3
-; SSSE3-NEXT: psrld $24, %xmm2
-; SSSE3-NEXT: packuswb %xmm3, %xmm2
-; SSSE3-NEXT: packuswb %xmm2, %xmm0
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i32_16i8_lshr:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrld $24, %xmm1
-; SSE41-NEXT: psrld $24, %xmm0
-; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: psrld $24, %xmm3
-; SSE41-NEXT: psrld $24, %xmm2
-; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: packuswb %xmm2, %xmm0
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc16i32_16i8_lshr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $24, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrld $24, %xmm2, %xmm2
-; AVX1-NEXT: vpsrld $24, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i32_16i8_lshr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrld $24, %ymm1, %ymm1
-; AVX2-NEXT: vpsrld $24, %ymm0, %ymm0
-; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: trunc16i32_16i8_lshr:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrld $24, %zmm0, %zmm0
-; AVX512-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512-NEXT: vzeroupper
-; AVX512-NEXT: retq
-entry:
- %0 = lshr <16 x i32> %a, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
- %1 = trunc <16 x i32> %0 to <16 x i8>
- store <16 x i8> %1, <16 x i8>* undef, align 4
- ret void
-}
-
-;PR25684
-define void @trunc16i16_16i8(<16 x i16> %a) {
-; SSE2-LABEL: trunc16i16_16i8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc16i16_16i8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc16i16_16i8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc16i16_16i8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i16_16i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc16i16_16i8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc16i16_16i8:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc16i16_16i8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc16i16_16i8:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = trunc <16 x i16> %a to <16 x i8>
- store <16 x i8> %0, <16 x i8>* undef, align 4
- ret void
-}
-
-define void @trunc16i16_16i8_ashr(<16 x i16> %a) {
-; SSE-LABEL: trunc16i16_16i8_ashr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: psraw $8, %xmm0
-; SSE-NEXT: packsswb %xmm1, %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc16i16_16i8_ashr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i16_16i8_ashr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsraw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc16i16_16i8_ashr:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc16i16_16i8_ashr:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc16i16_16i8_ashr:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpsraw $8, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc16i16_16i8_ashr:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = ashr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- %1 = trunc <16 x i16> %0 to <16 x i8>
- store <16 x i8> %1, <16 x i8>* undef, align 4
- ret void
-}
-
-define void @trunc16i16_16i8_lshr(<16 x i16> %a) {
-; SSE-LABEL: trunc16i16_16i8_lshr:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: movdqu %xmm0, (%rax)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc16i16_16i8_lshr:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc16i16_16i8_lshr:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqu %xmm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc16i16_16i8_lshr:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc16i16_16i8_lshr:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc16i16_16i8_lshr:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqu %xmm0, (%rax)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc16i16_16i8_lshr:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rax)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = lshr <16 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- %1 = trunc <16 x i16> %0 to <16 x i8>
- store <16 x i8> %1, <16 x i8>* undef, align 4
- ret void
-}
-
-define void @trunc32i16_32i8(<32 x i16> %a) {
-; SSE2-LABEL: trunc32i16_32i8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: movdqu %xmm2, (%rax)
-; SSE2-NEXT: movdqu %xmm0, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc32i16_32i8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm4, %xmm1
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: pshufb %xmm4, %xmm3
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSSE3-NEXT: movdqu %xmm2, (%rax)
-; SSSE3-NEXT: movdqu %xmm0, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc32i16_32i8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm4, %xmm1
-; SSE41-NEXT: pshufb %xmm4, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: pshufb %xmm4, %xmm3
-; SSE41-NEXT: pshufb %xmm4, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: movdqu %xmm2, (%rax)
-; SSE41-NEXT: movdqu %xmm0, (%rax)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc32i16_32i8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqu %xmm1, (%rax)
-; AVX1-NEXT: vmovdqu %xmm0, (%rax)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: trunc32i16_32i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; AVX2-NEXT: vpackuswb %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vmovdqu %ymm0, (%rax)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: trunc32i16_32i8:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm1, (%rax)
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc32i16_32i8:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm1, (%rax)
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, (%rax)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc32i16_32i8:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpmovwb %zmm0, (%rax)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc32i16_32i8:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovwb %zmm0, (%rax)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = trunc <32 x i16> %a to <32 x i8>
- store <32 x i8> %0, <32 x i8>* undef, align 4
- ret void
-}
-
-define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) {
-; SSE-LABEL: trunc2x4i64_8i32:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
-; SSE-NEXT: movaps %xmm2, %xmm1
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: trunc2x4i64_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc2x4i64_8i32:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc2x4i64_8i32:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
-; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc2x4i64_8i32:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc2x4i64_8i32:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512VL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc2x4i64_8i32:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
-; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc2x4i64_8i32:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovqd %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpmovqd %ymm1, %xmm1
-; AVX512BWVL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = trunc <4 x i64> %a to <4 x i32>
- %1 = trunc <4 x i64> %b to <4 x i32>
- %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i32> %2
-}
-
-define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
-; SSE2-LABEL: trunc2x4i64_8i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc2x4i64_8i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,2,2,3,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc2x4i64_8i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,2,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc2x4i64_8i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc2x4i64_8i16:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc2x4i64_8i16:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
-; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc2x4i64_8i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc2x4i64_8i16:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc2x4i64_8i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
-; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc2x4i64_8i16:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpmovqw %ymm1, %xmm1
-; AVX512BWVL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = trunc <4 x i64> %a to <4 x i16>
- %1 = trunc <4 x i64> %b to <4 x i16>
- %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %2
-}
-
-define <4 x i32> @trunc2x2i64_4i32(<2 x i64> %a, <2 x i64> %b) {
-; SSE-LABEL: trunc2x2i64_4i32:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: trunc2x2i64_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: trunc2x2i64_4i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <2 x i64> %a to <2 x i32>
- %1 = trunc <2 x i64> %b to <2 x i32>
- %2 = shufflevector <2 x i32> %0, <2 x i32> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- ret <4 x i32> %2
-}
-
-define i64 @trunc2i64_i64(<2 x i64> %inval) {
-; SSE-LABEL: trunc2i64_i64:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: trunc2i64_i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: trunc2i64_i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <2 x i64> %inval to <2 x i32>
- %1 = bitcast <2 x i32> %0 to i64
- ret i64 %1
-}
-
-define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
-; SSE2-LABEL: trunc2x4i32_8i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc2x4i32_8i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc2x4i32_8i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: trunc2x4i32_8i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
-;
-; AVX512F-LABEL: trunc2x4i32_8i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc2x4i32_8i16:
-; AVX512VL: # %bb.0: # %entry
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: trunc2x4i32_8i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: trunc2x4i32_8i16:
-; AVX512BWVL: # %bb.0: # %entry
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14]
-; AVX512BWVL-NEXT: vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512BWVL-NEXT: retq
-entry:
- %0 = trunc <4 x i32> %a to <4 x i16>
- %1 = trunc <4 x i32> %b to <4 x i16>
- %2 = shufflevector <4 x i16> %0, <4 x i16> %1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- ret <8 x i16> %2
-}
-
-; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
-define i64 @trunc4i32_i64(<4 x i32> %inval) {
-; SSE2-LABEL: trunc4i32_i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc4i32_i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: movq %xmm0, %rax
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc4i32_i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: trunc4i32_i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: trunc4i32_i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <4 x i32> %inval to <4 x i16>
- %1 = bitcast <4 x i16> %0 to i64
- ret i64 %1
-}
-
-define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
-; SSE2-LABEL: trunc2x8i16_16i8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc2x8i16_16i8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT: pshufb %xmm2, %xmm1
-; SSSE3-NEXT: pshufb %xmm2, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc2x8i16_16i8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: trunc2x8i16_16i8:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: trunc2x8i16_16i8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <8 x i16> %a to <8 x i8>
- %1 = trunc <8 x i16> %b to <8 x i8>
- %2 = shufflevector <8 x i8> %0, <8 x i8> %1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
- ret <16 x i8> %2
-}
-
-; PR15524 http://llvm.org/bugs/show_bug.cgi?id=15524
-define i64 @trunc8i16_i64(<8 x i16> %inval) {
-; SSE2-LABEL: trunc8i16_i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: packuswb %xmm0, %xmm0
-; SSE2-NEXT: movq %xmm0, %rax
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: trunc8i16_i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSSE3-NEXT: movq %xmm0, %rax
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc8i16_i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE41-NEXT: movq %xmm0, %rax
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: trunc8i16_i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vmovq %xmm0, %rax
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: trunc8i16_i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm0, %rax
-; AVX512-NEXT: retq
-entry:
- %0 = trunc <8 x i16> %inval to <8 x i8>
- %1 = bitcast <8 x i8> %0 to i64
- ret i64 %1
-}
-
-define <16 x i8> @trunc16i64_16i8_const() {
-; SSE-LABEL: trunc16i64_16i8_const:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: retq
-;
-; AVX-LABEL: trunc16i64_16i8_const:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
-;
-; AVX512-LABEL: trunc16i64_16i8_const:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: retq
-
-entry:
- %0 = trunc <16 x i64> zeroinitializer to <16 x i8>
- %1 = shufflevector <16 x i8> %0, <16 x i8> %0, <16 x i32> <i32 28, i32 30, i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 undef, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26>
- ret <16 x i8> %1
-}
-
-define <8 x i16> @PR32160(<8 x i32> %x) {
-; SSE-LABEL: PR32160:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,7]
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: PR32160:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: PR32160:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vzeroupper
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: PR32160:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,8,9,8,9,8,9,8,9,8,9,8,9,8,9]
-; AVX2-FAST-NEXT: vzeroupper
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: PR32160:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
-; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: PR32160:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512VL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: PR32160:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: PR32160:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovdw %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %shuf = trunc <8 x i32> %x to <8 x i16>
- %trunc = shufflevector <8 x i16> %shuf, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
- ret <8 x i16> %trunc
-}
-
-define void @PR34773(i16* %a0, i8* %a1) {
-; SSE-LABEL: PR34773:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqu (%rdi), %xmm0
-; SSE-NEXT: movdqu 16(%rdi), %xmm1
-; SSE-NEXT: movdqu 32(%rdi), %xmm2
-; SSE-NEXT: movdqu 48(%rdi), %xmm3
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: psrlw $8, %xmm0
-; SSE-NEXT: packuswb %xmm1, %xmm0
-; SSE-NEXT: psrlw $8, %xmm3
-; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: packuswb %xmm3, %xmm2
-; SSE-NEXT: movdqu %xmm0, (%rsi)
-; SSE-NEXT: movdqu %xmm2, 16(%rsi)
-; SSE-NEXT: retq
-;
-; AVX1-LABEL: PR34773:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
-; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
-; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vmovdqu %xmm0, (%rsi)
-; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: PR34773:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqu %xmm0, (%rsi)
-; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: PR34773:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX512F-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, (%rsi)
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512F-NEXT: vpmovdb %zmm0, 16(%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: PR34773:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, (%rsi)
-; AVX512VL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
-; AVX512VL-NEXT: vpmovdb %zmm0, 16(%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: PR34773:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512BW-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX512BW-NEXT: vpsrlw $8, %ymm0, %ymm0
-; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi)
-; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: PR34773:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpsrlw $8, (%rdi), %ymm0
-; AVX512BWVL-NEXT: vpsrlw $8, 32(%rdi), %ymm1
-; AVX512BWVL-NEXT: vpmovwb %ymm0, (%rsi)
-; AVX512BWVL-NEXT: vpmovwb %ymm1, 16(%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %1 = getelementptr i16, i16* %a0, i64 16
- %2 = getelementptr i8, i8* %a1, i64 16
- %3 = bitcast i16* %a0 to <16 x i16>*
- %4 = bitcast i16* %1 to <16 x i16>*
- %5 = bitcast i8* %a1 to <16 x i8>*
- %6 = bitcast i8* %2 to <16 x i8>*
- %7 = load <16 x i16>, <16 x i16>* %3, align 2
- %8 = load <16 x i16>, <16 x i16>* %4, align 2
- %9 = lshr <16 x i16> %7, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- %10 = lshr <16 x i16> %8, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
- %11 = trunc <16 x i16> %9 to <16 x i8>
- %12 = trunc <16 x i16> %10 to <16 x i8>
- store <16 x i8> %11, <16 x i8>* %5, align 1
- store <16 x i8> %12, <16 x i8>* %6, align 1
- ret void
-}
-
-; Store merging must not infinitely fight store splitting.
-
-define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>* %p) align 2 {
-; SSE2-LABEL: store_merge_split:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pslld $16, %xmm1
-; SSE2-NEXT: psrad $16, %xmm1
-; SSE2-NEXT: pslld $16, %xmm0
-; SSE2-NEXT: psrad $16, %xmm0
-; SSE2-NEXT: packssdw %xmm1, %xmm0
-; SSE2-NEXT: pslld $16, %xmm3
-; SSE2-NEXT: psrad $16, %xmm3
-; SSE2-NEXT: pslld $16, %xmm2
-; SSE2-NEXT: psrad $16, %xmm2
-; SSE2-NEXT: packssdw %xmm3, %xmm2
-; SSE2-NEXT: shlq $4, %rdi
-; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi)
-; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: store_merge_split:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT: pshufb %xmm4, %xmm1
-; SSSE3-NEXT: pshufb %xmm4, %xmm0
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSSE3-NEXT: pshufb %xmm4, %xmm3
-; SSSE3-NEXT: pshufb %xmm4, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSSE3-NEXT: shlq $4, %rdi
-; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi)
-; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: store_merge_split:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE41-NEXT: pshufb %xmm4, %xmm1
-; SSE41-NEXT: pshufb %xmm4, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: pshufb %xmm4, %xmm3
-; SSE41-NEXT: pshufb %xmm4, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: shlq $4, %rdi
-; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi)
-; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: store_merge_split:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT: shlq $4, %rdi
-; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
-; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
-; AVX1-NEXT: vzeroupper
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: store_merge_split:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT: shlq $4, %rdi
-; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
-; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: store_merge_split:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT: shlq $4, %rdi
-; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
-; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: store_merge_split:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: shlq $4, %rdi
-; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi)
-; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: store_merge_split:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
-; AVX512BW-NEXT: shlq $4, %rdi
-; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
-; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: store_merge_split:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: shlq $4, %rdi
-; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi)
-; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
- %t1 = trunc <8 x i32> %w1 to <8 x i16>
- %t2 = trunc <8 x i32> %w2 to <8 x i16>
- %g1 = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 %idx
- %g2 = getelementptr inbounds <8 x i16>, <8 x i16>* %g1, i64 1
- store <8 x i16> %t1, <8 x i16>* %g1, align 2
- store <8 x i16> %t2, <8 x i16>* %g2, align 2
- ret void
-}
store <16 x i8> %12, <16 x i8>* %6, align 1
ret void
}
+
+; Store merging must not infinitely fight store splitting.
+
+define void @store_merge_split(<8 x i32> %w1, <8 x i32> %w2, i64 %idx, <8 x i16>* %p) align 2 {
+; SSE2-LABEL: store_merge_split:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pslld $16, %xmm1
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: pslld $16, %xmm0
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: packssdw %xmm1, %xmm0
+; SSE2-NEXT: pslld $16, %xmm3
+; SSE2-NEXT: psrad $16, %xmm3
+; SSE2-NEXT: pslld $16, %xmm2
+; SSE2-NEXT: psrad $16, %xmm2
+; SSE2-NEXT: packssdw %xmm3, %xmm2
+; SSE2-NEXT: shlq $4, %rdi
+; SSE2-NEXT: movdqu %xmm0, (%rsi,%rdi)
+; SSE2-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: store_merge_split:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT: pshufb %xmm4, %xmm1
+; SSSE3-NEXT: pshufb %xmm4, %xmm0
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: pshufb %xmm4, %xmm3
+; SSSE3-NEXT: pshufb %xmm4, %xmm2
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSSE3-NEXT: shlq $4, %rdi
+; SSSE3-NEXT: movdqu %xmm0, (%rsi,%rdi)
+; SSSE3-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: store_merge_split:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE41-NEXT: pshufb %xmm4, %xmm1
+; SSE41-NEXT: pshufb %xmm4, %xmm0
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: pshufb %xmm4, %xmm3
+; SSE41-NEXT: pshufb %xmm4, %xmm2
+; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE41-NEXT: shlq $4, %rdi
+; SSE41-NEXT: movdqu %xmm0, (%rsi,%rdi)
+; SSE41-NEXT: movdqu %xmm2, 16(%rsi,%rdi)
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: store_merge_split:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; AVX1-NEXT: shlq $4, %rdi
+; AVX1-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
+; AVX1-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
+; AVX1-NEXT: vzeroupper
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: store_merge_split:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
+; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
+; AVX2-NEXT: shlq $4, %rdi
+; AVX2-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
+; AVX2-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: store_merge_split:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT: shlq $4, %rdi
+; AVX512F-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
+; AVX512F-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: store_merge_split:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: shlq $4, %rdi
+; AVX512VL-NEXT: vpmovdw %ymm0, (%rsi,%rdi)
+; AVX512VL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: store_merge_split:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
+; AVX512BW-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512BW-NEXT: shlq $4, %rdi
+; AVX512BW-NEXT: vmovdqu %xmm0, (%rsi,%rdi)
+; AVX512BW-NEXT: vmovdqu %xmm1, 16(%rsi,%rdi)
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512BWVL-LABEL: store_merge_split:
+; AVX512BWVL: # %bb.0:
+; AVX512BWVL-NEXT: shlq $4, %rdi
+; AVX512BWVL-NEXT: vpmovdw %ymm0, (%rsi,%rdi)
+; AVX512BWVL-NEXT: vpmovdw %ymm1, 16(%rsi,%rdi)
+; AVX512BWVL-NEXT: vzeroupper
+; AVX512BWVL-NEXT: retq
+ %t1 = trunc <8 x i32> %w1 to <8 x i16>
+ %t2 = trunc <8 x i32> %w2 to <8 x i16>
+ %g1 = getelementptr inbounds <8 x i16>, <8 x i16>* %p, i64 %idx
+ %g2 = getelementptr inbounds <8 x i16>, <8 x i16>* %g1, i64 1
+ store <8 x i16> %t1, <8 x i16>* %g1, align 2
+ store <8 x i16> %t2, <8 x i16>* %g2, align 2
+ ret void
+}
+++ /dev/null
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-SLOW
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 --check-prefix=AVX2-FAST
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+fast-variable-shuffle | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
-
-define <8 x i16> @zext_16i8_to_8i16(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_16i8_to_8i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i8_to_8i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i8_to_8i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: zext_16i8_to_8i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX-NEXT: retq
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %C = zext <8 x i8> %B to <8 x i16>
- ret <8 x i16> %C
-}
-
-; PR17654
-define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
-; SSE2-LABEL: zext_16i8_to_16i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i8_to_16i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i8_to_16i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_16i8_to_16i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_16i8_to_16i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_16i8_to_16i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT: retq
-entry:
- %B = zext <16 x i8> %A to <16 x i16>
- ret <16 x i16> %B
-}
-
-define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
-; SSE2-LABEL: zext_32i8_to_32i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_32i8_to_32i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_32i8_to_32i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_32i8_to_32i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_32i8_to_32i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: zext_32i8_to_32i16:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512F-NEXT: vmovdqa %ymm2, %ymm0
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: zext_32i8_to_32i16:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
-; AVX512BW-NEXT: retq
-entry:
- %B = zext <32 x i8> %A to <32 x i16>
- ret <32 x i16> %B
-}
-
-define <4 x i32> @zext_16i8_to_4i32(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_16i8_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i8_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i8_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: zext_16i8_to_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT: retq
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %C = zext <4 x i8> %B to <4 x i32>
- ret <4 x i32> %C
-}
-
-define <8 x i32> @zext_16i8_to_8i32(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_16i8_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i8_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i8_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_16i8_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_16i8_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_16i8_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %C = zext <8 x i8> %B to <8 x i32>
- ret <8 x i32> %C
-}
-
-define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_16i8_to_16i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i8_to_16i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i8_to_16i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_16i8_to_16i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_16i8_to_16i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_16i8_to_16i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = zext <16 x i8> %A to <16 x i32>
- ret <16 x i32> %B
-}
-
-define <2 x i64> @zext_16i8_to_2i64(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_16i8_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i8_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i8_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: zext_16i8_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
- %C = zext <2 x i8> %B to <2 x i64>
- ret <2 x i64> %C
-}
-
-define <4 x i64> @zext_16i8_to_4i64(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_16i8_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i8_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i8_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrld $16, %xmm0
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_16i8_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_16i8_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_16i8_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %C = zext <4 x i8> %B to <4 x i64>
- ret <4 x i64> %C
-}
-
-define <8 x i64> @zext_16i8_to_8i64(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_16i8_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i8_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i8_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrld $16, %xmm1
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrlq $48, %xmm0
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_16i8_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_16i8_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_16i8_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %C = zext <8 x i8> %B to <8 x i64>
- ret <8 x i64> %C
-}
-
-define <4 x i32> @zext_8i16_to_4i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_8i16_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_8i16_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_8i16_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: zext_8i16_to_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX-NEXT: retq
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %C = zext <4 x i16> %B to <4 x i32>
- ret <4 x i32> %C
-}
-
-define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_8i16_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_8i16_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_8i16_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_8i16_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_8i16_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_8i16_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: retq
-entry:
- %B = zext <8 x i16> %A to <8 x i32>
- ret <8 x i32>%B
-}
-
-define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_16i16_to_16i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm1, %xmm3
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_16i16_to_16i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm1, %xmm3
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_16i16_to_16i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_16i16_to_16i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_16i16_to_16i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_16i16_to_16i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
-; AVX512-NEXT: retq
-entry:
- %B = zext <16 x i16> %A to <16 x i32>
- ret <16 x i32> %B
-}
-
-define <2 x i64> @zext_8i16_to_2i64(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_8i16_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_8i16_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_8i16_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: zext_8i16_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX-NEXT: retq
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
- %C = zext <2 x i16> %B to <2 x i64>
- ret <2 x i64> %C
-}
-
-define <4 x i64> @zext_8i16_to_4i64(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_8i16_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_8i16_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_8i16_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_8i16_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_8i16_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_8i16_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- %C = zext <4 x i16> %B to <4 x i64>
- ret <4 x i64> %C
-}
-
-define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_8i16_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_8i16_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: pxor %xmm4, %xmm4
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_8i16_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm4, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_8i16_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_8i16_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_8i16_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = zext <8 x i16> %A to <8 x i64>
- ret <8 x i64> %B
-}
-
-define <2 x i64> @zext_4i32_to_2i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_4i32_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_4i32_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_4i32_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: zext_4i32_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX-NEXT: retq
-entry:
- %B = shufflevector <4 x i32> %A, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
- %C = zext <2 x i32> %B to <2 x i64>
- ret <2 x i64> %C
-}
-
-define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_4i32_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_4i32_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movaps %xmm0, %xmm1
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_4i32_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_4i32_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_4i32_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_4i32_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: retq
-entry:
- %B = zext <4 x i32> %A to <4 x i64>
- ret <4 x i64>%B
-}
-
-define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_8i32_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps %xmm1, %xmm3
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: xorps %xmm4, %xmm4
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: movaps %xmm3, %xmm2
-; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_8i32_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movaps %xmm1, %xmm3
-; SSSE3-NEXT: movaps %xmm0, %xmm1
-; SSSE3-NEXT: xorps %xmm4, %xmm4
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSSE3-NEXT: movaps %xmm3, %xmm2
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSSE3-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_8i32_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_8i32_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
-; AVX1-NEXT: vmovaps %ymm2, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_8i32_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vmovdqa %ymm2, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_8i32_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
-; AVX512-NEXT: retq
-entry:
- %B = zext <8 x i32> %A to <8 x i64>
- ret <8 x i64>%B
-}
-
-define <2 x i64> @load_zext_2i8_to_2i64(<2 x i8> *%ptr) {
-; SSE2-LABEL: load_zext_2i8_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movzwl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_2i8_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movzwl (%rdi), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_2i8_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_zext_2i8_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT: retq
-entry:
- %X = load <2 x i8>, <2 x i8>* %ptr
- %Y = zext <2 x i8> %X to <2 x i64>
- ret <2 x i64> %Y
-}
-
-define <4 x i32> @load_zext_4i8_to_4i32(<4 x i8> *%ptr) {
-; SSE2-LABEL: load_zext_4i8_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_4i8_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_4i8_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_zext_4i8_to_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX-NEXT: retq
-entry:
- %X = load <4 x i8>, <4 x i8>* %ptr
- %Y = zext <4 x i8> %X to <4 x i32>
- ret <4 x i32> %Y
-}
-
-define <4 x i64> @load_zext_4i8_to_4i64(<4 x i8> *%ptr) {
-; SSE2-LABEL: load_zext_4i8_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_4i8_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_4i8_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_zext_4i8_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_zext_4i8_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_zext_4i8_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %X = load <4 x i8>, <4 x i8>* %ptr
- %Y = zext <4 x i8> %X to <4 x i64>
- ret <4 x i64> %Y
-}
-
-define <8 x i16> @load_zext_8i8_to_8i16(<8 x i8> *%ptr) {
-; SSE2-LABEL: load_zext_8i8_to_8i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_8i8_to_8i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_8i8_to_8i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_zext_8i8_to_8i16:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX-NEXT: retq
-entry:
- %X = load <8 x i8>, <8 x i8>* %ptr
- %Y = zext <8 x i8> %X to <8 x i16>
- ret <8 x i16> %Y
-}
-
-define <8 x i32> @load_zext_8i8_to_8i32(<8 x i8> *%ptr) {
-; SSE2-LABEL: load_zext_8i8_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_8i8_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_8i8_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_zext_8i8_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_zext_8i8_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_zext_8i8_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %X = load <8 x i8>, <8 x i8>* %ptr
- %Y = zext <8 x i8> %X to <8 x i32>
- ret <8 x i32> %Y
-}
-
-define <8 x i32> @load_zext_16i8_to_8i32(<16 x i8> *%ptr) {
-; SSE2-LABEL: load_zext_16i8_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_16i8_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_16i8_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_zext_16i8_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_zext_16i8_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_zext_16i8_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %X = load <16 x i8>, <16 x i8>* %ptr
- %Y = shufflevector <16 x i8> %X, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
- %Z = zext <8 x i8> %Y to <8 x i32>
- ret <8 x i32> %Z
-}
-
-define <8 x i64> @load_zext_8i8_to_8i64(<8 x i8> *%ptr) {
-; SSE2-LABEL: load_zext_8i8_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_8i8_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; SSSE3-NEXT: movdqa %xmm3, %xmm0
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: movdqa %xmm3, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,zero,zero,zero,zero,xmm1[3],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: movdqa %xmm3, %xmm2
-; SSSE3-NEXT: pshufb {{.*#+}} xmm2 = xmm2[4],zero,zero,zero,zero,zero,zero,zero,xmm2[5],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm3 = xmm3[6],zero,zero,zero,zero,zero,zero,zero,xmm3[7],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_8i8_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_zext_8i8_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm2 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm3 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_zext_8i8_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_zext_8i8_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %X = load <8 x i8>, <8 x i8>* %ptr
- %Y = zext <8 x i8> %X to <8 x i64>
- ret <8 x i64> %Y
-}
-
-define <16 x i16> @load_zext_16i8_to_16i16(<16 x i8> *%ptr) {
-; SSE2-LABEL: load_zext_16i8_to_16i16:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_16i8_to_16i16:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_16i8_to_16i16:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_zext_16i8_to_16i16:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_zext_16i8_to_16i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_zext_16i8_to_16i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX512-NEXT: retq
-entry:
- %X = load <16 x i8>, <16 x i8>* %ptr
- %Y = zext <16 x i8> %X to <16 x i16>
- ret <16 x i16> %Y
-}
-
-define <2 x i64> @load_zext_2i16_to_2i64(<2 x i16> *%ptr) {
-; SSE2-LABEL: load_zext_2i16_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_2i16_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_2i16_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_zext_2i16_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX-NEXT: retq
-entry:
- %X = load <2 x i16>, <2 x i16>* %ptr
- %Y = zext <2 x i16> %X to <2 x i64>
- ret <2 x i64> %Y
-}
-
-define <4 x i32> @load_zext_4i16_to_4i32(<4 x i16> *%ptr) {
-; SSE2-LABEL: load_zext_4i16_to_4i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_4i16_to_4i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_4i16_to_4i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_zext_4i16_to_4i32:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX-NEXT: retq
-entry:
- %X = load <4 x i16>, <4 x i16>* %ptr
- %Y = zext <4 x i16> %X to <4 x i32>
- ret <4 x i32> %Y
-}
-
-define <4 x i64> @load_zext_4i16_to_4i64(<4 x i16> *%ptr) {
-; SSE2-LABEL: load_zext_4i16_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_4i16_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_4i16_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_zext_4i16_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_zext_4i16_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_zext_4i16_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %X = load <4 x i16>, <4 x i16>* %ptr
- %Y = zext <4 x i16> %X to <4 x i64>
- ret <4 x i64> %Y
-}
-
-define <8 x i32> @load_zext_8i16_to_8i32(<8 x i16> *%ptr) {
-; SSE2-LABEL: load_zext_8i16_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_8i16_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_8i16_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_zext_8i16_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_zext_8i16_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_zext_8i16_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512-NEXT: retq
-entry:
- %X = load <8 x i16>, <8 x i16>* %ptr
- %Y = zext <8 x i16> %X to <8 x i32>
- ret <8 x i32> %Y
-}
-
-define <2 x i64> @load_zext_2i32_to_2i64(<2 x i32> *%ptr) {
-; SSE2-LABEL: load_zext_2i32_to_2i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_2i32_to_2i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_2i32_to_2i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: load_zext_2i32_to_2i64:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX-NEXT: retq
-entry:
- %X = load <2 x i32>, <2 x i32>* %ptr
- %Y = zext <2 x i32> %X to <2 x i64>
- ret <2 x i64> %Y
-}
-
-define <4 x i64> @load_zext_4i32_to_4i64(<4 x i32> *%ptr) {
-; SSE2-LABEL: load_zext_4i32_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps (%rdi), %xmm1
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: movaps %xmm1, %xmm0
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: load_zext_4i32_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movaps (%rdi), %xmm1
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: movaps %xmm1, %xmm0
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: load_zext_4i32_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: load_zext_4i32_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_zext_4i32_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_zext_4i32_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX512-NEXT: retq
-entry:
- %X = load <4 x i32>, <4 x i32>* %ptr
- %Y = zext <4 x i32> %X to <4 x i64>
- ret <4 x i64> %Y
-}
-
-define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
-; SSE2-LABEL: zext_8i8_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_8i8_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_8i8_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_8i8_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_8i8_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_8i8_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %t = zext <8 x i8> %z to <8 x i32>
- ret <8 x i32> %t
-}
-
-define <8 x i32> @shuf_zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_8i16_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_8i16_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_8i16_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_8i16_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuf_zext_8i16_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: shuf_zext_8i16_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 0, i32 8, i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8>
- %Z = bitcast <16 x i16> %B to <8 x i32>
- ret <8 x i32> %Z
-}
-
-define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_4i32_to_4i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_4i32_to_4i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movaps %xmm0, %xmm1
-; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_4i32_to_4i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_4i32_to_4i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuf_zext_4i32_to_4i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: shuf_zext_4i32_to_4i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 4, i32 1, i32 4, i32 2, i32 4, i32 3, i32 4>
- %Z = bitcast <8 x i32> %B to <4 x i64>
- ret <4 x i64> %Z
-}
-
-define <8 x i32> @shuf_zext_8i8_to_8i32(<8 x i8> %A) {
-; SSE2-LABEL: shuf_zext_8i8_to_8i32:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_8i8_to_8i32:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_8i8_to_8i32:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_8i8_to_8i32:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuf_zext_8i8_to_8i32:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: shuf_zext_8i8_to_8i32:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <8 x i8> %A, <8 x i8> zeroinitializer, <32 x i32> <i32 0, i32 8, i32 8, i32 8, i32 1, i32 8, i32 8, i32 8, i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8, i32 6, i32 8, i32 8, i32 8, i32 7, i32 8, i32 8, i32 8>
- %Z = bitcast <32 x i8> %B to <8 x i32>
- ret <8 x i32> %Z
-}
-
-define <2 x i64> @shuf_zext_16i8_to_2i64_offset6(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrlq $48, %xmm0
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrlq $48, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: shuf_zext_16i8_to_2i64_offset6:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: retq
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <16 x i32> <i32 6, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 7, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
- %Z = bitcast <16 x i8> %B to <2 x i64>
- ret <2 x i64> %Z
-}
-
-define <4 x i64> @shuf_zext_16i8_to_4i64_offset11(<16 x i8> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,zero,zero,zero,zero,zero,xmm0[12],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[13],zero,zero,zero,zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm1 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: shuf_zext_16i8_to_4i64_offset11:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <16 x i8> %A, <16 x i8> zeroinitializer, <32 x i32> <i32 11, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 12, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 13, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 14, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
- %Z = bitcast <32 x i8> %B to <4 x i64>
- ret <4 x i64> %Z
-}
-
-define <2 x i64> @shuf_zext_8i16_to_2i64_offset6(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-SLOW-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512F-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: shuf_zext_8i16_to_2i64_offset6:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[6,7],zero,zero,zero,zero,zero,zero,xmm0[8,9],zero,zero,zero,zero,zero,zero
-; AVX512BW-NEXT: retq
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8>
- %Z = bitcast <8 x i16> %B to <2 x i64>
- ret <2 x i64> %Z
-}
-
-define <4 x i64> @shuf_zext_8i16_to_4i64_offset2(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm2, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; AVX512-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 8, i32 8, i32 8, i32 3, i32 8, i32 8, i32 8, i32 4, i32 8, i32 8, i32 8, i32 5, i32 8, i32 8, i32 8>
- %Z = bitcast <16 x i16> %B to <4 x i64>
- ret <4 x i64> %Z
-}
-
-define <4 x i32> @shuf_zext_8i16_to_4i32_offset1(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: psrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX2-SLOW: # %bb.0: # %entry
-; AVX2-SLOW-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX2-SLOW-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX2-FAST: # %bb.0: # %entry
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
-; AVX512F-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: shuf_zext_8i16_to_4i32_offset1:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3],zero,zero,xmm0[4,5],zero,zero,xmm0[6,7],zero,zero,xmm0[8,9],zero,zero
-; AVX512BW-NEXT: retq
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 3, i32 8, i32 4, i32 8>
- %Z = bitcast <8 x i16> %B to <4 x i32>
- ret <4 x i32> %Z
-}
-
-define <8 x i32> @shuf_zext_8i16_to_8i32_offset3(<8 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13]
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: shuf_zext_8i16_to_8i32_offset3:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 3, i32 8, i32 4, i32 8, i32 5, i32 8, i32 6, i32 8, i32 7, i32 8, i32 undef, i32 8, i32 undef, i32 8, i32 undef, i32 8>
- %Z = bitcast <16 x i16> %B to <8 x i32>
- ret <8 x i32> %Z
-}
-
-define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: pxor %xmm2, %xmm2
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: shuf_zext_16i16_to_8i32_offset8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <16 x i16> %A, <16 x i16> zeroinitializer, <16 x i32> <i32 8, i32 16, i32 9, i32 16, i32 10, i32 16, i32 11, i32 16, i32 12, i32 16, i32 undef, i32 16, i32 14, i32 16, i32 undef, i32 16>
- %Z = bitcast <16 x i16> %B to <8 x i32>
- ret <8 x i32> %Z
-}
-
-define <2 x i64> @shuf_zext_4i32_to_2i64_offset2(<4 x i32> %A) nounwind uwtable readnone ssp {
-; SSE-LABEL: shuf_zext_4i32_to_2i64_offset2:
-; SSE: # %bb.0: # %entry
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: retq
-;
-; AVX-LABEL: shuf_zext_4i32_to_2i64_offset2:
-; AVX: # %bb.0: # %entry
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX-NEXT: retq
-entry:
- %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 3, i32 4>
- %Z = bitcast <4 x i32> %B to <2 x i64>
- ret <2 x i64> %Z
-}
-
-define <4 x i64> @shuf_zext_4i32_to_4i64_offset1(<4 x i32> %A) nounwind uwtable readnone ssp {
-; SSE2-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0]
-; SSE2-NEXT: pand %xmm1, %xmm0
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [0,0,4294967295,0]
-; SSSE3-NEXT: pand %xmm1, %xmm0
-; SSSE3-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pxor %xmm0, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5],xmm0[6,7]
-; SSE41-NEXT: psrldq {{.*#+}} xmm1 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: shuf_zext_4i32_to_4i64_offset1:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,2,3,3]
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: retq
-entry:
- %B = shufflevector <4 x i32> %A, <4 x i32> zeroinitializer, <8 x i32> <i32 undef, i32 4, i32 2, i32 4, i32 3, i32 4, i32 undef, i32 4>
- %Z = bitcast <8 x i32> %B to <4 x i64>
- ret <4 x i64> %Z
-}
-
-define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
-; SSE2-LABEL: zext_32i8_to_32i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %rdi, %rax
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa %xmm0, %xmm3
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm3, %xmm8
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSE2-NEXT: movdqa %xmm0, %xmm5
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, %xmm6
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm6, %xmm7
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSE2-NEXT: movdqa %xmm1, %xmm4
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSE2-NEXT: movdqa %xmm1, 112(%rdi)
-; SSE2-NEXT: movdqa %xmm4, 96(%rdi)
-; SSE2-NEXT: movdqa %xmm6, 80(%rdi)
-; SSE2-NEXT: movdqa %xmm7, 64(%rdi)
-; SSE2-NEXT: movdqa %xmm0, 48(%rdi)
-; SSE2-NEXT: movdqa %xmm5, 32(%rdi)
-; SSE2-NEXT: movdqa %xmm3, 16(%rdi)
-; SSE2-NEXT: movdqa %xmm8, (%rdi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_32i8_to_32i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq %rdi, %rax
-; SSSE3-NEXT: pxor %xmm2, %xmm2
-; SSSE3-NEXT: movdqa %xmm0, %xmm3
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm3, %xmm8
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
-; SSSE3-NEXT: movdqa %xmm0, %xmm5
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm1, %xmm6
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm6, %xmm7
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; SSSE3-NEXT: movdqa %xmm1, %xmm4
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; SSSE3-NEXT: movdqa %xmm1, 112(%rdi)
-; SSSE3-NEXT: movdqa %xmm4, 96(%rdi)
-; SSSE3-NEXT: movdqa %xmm6, 80(%rdi)
-; SSSE3-NEXT: movdqa %xmm7, 64(%rdi)
-; SSSE3-NEXT: movdqa %xmm0, 48(%rdi)
-; SSSE3-NEXT: movdqa %xmm5, 32(%rdi)
-; SSSE3-NEXT: movdqa %xmm3, 16(%rdi)
-; SSSE3-NEXT: movdqa %xmm8, (%rdi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_32i8_to_32i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movq %rdi, %rax
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; SSE41-NEXT: movdqa %xmm1, 112(%rdi)
-; SSE41-NEXT: movdqa %xmm7, 96(%rdi)
-; SSE41-NEXT: movdqa %xmm6, 80(%rdi)
-; SSE41-NEXT: movdqa %xmm5, 64(%rdi)
-; SSE41-NEXT: movdqa %xmm0, 48(%rdi)
-; SSE41-NEXT: movdqa %xmm4, 32(%rdi)
-; SSE41-NEXT: movdqa %xmm3, 16(%rdi)
-; SSE41-NEXT: movdqa %xmm2, (%rdi)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_32i8_to_32i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,0,1]
-; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
-; AVX1-NEXT: vmovaps %ymm4, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_32i8_to_32i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX2-NEXT: vmovdqa %ymm4, %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_32i8_to_32i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
-; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512-NEXT: retq
- %res = zext <32 x i8>%x to <32 x i32>
- ret <32 x i32> %res
-}
-
-define <2 x i32> @zext_2i8_to_2i32(<2 x i8>* %addr) {
-; SSE2-LABEL: zext_2i8_to_2i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movzwl (%rdi), %eax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: paddd %xmm0, %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_2i8_to_2i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movzwl (%rdi), %eax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT: paddd %xmm0, %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_2i8_to_2i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movzwl (%rdi), %eax
-; SSE41-NEXT: movd %eax, %xmm0
-; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; SSE41-NEXT: paddd %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: zext_2i8_to_2i32:
-; AVX: # %bb.0:
-; AVX-NEXT: movzwl (%rdi), %eax
-; AVX-NEXT: vmovd %eax, %xmm0
-; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX-NEXT: vpaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: retq
- %x = load <2 x i8>, <2 x i8>* %addr, align 1
- %y = zext <2 x i8> %x to <2 x i32>
- %z = add <2 x i32>%y, %y
- ret <2 x i32>%z
-}
-
-define <4 x i32> @zext_4i17_to_4i32(<4 x i17>* %ptr) {
-; SSE2-LABEL: zext_4i17_to_4i32:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movd %eax, %xmm0
-; SSE2-NEXT: movq %rax, %rcx
-; SSE2-NEXT: shrq $17, %rcx
-; SSE2-NEXT: movd %ecx, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movl 8(%rdi), %ecx
-; SSE2-NEXT: shll $13, %ecx
-; SSE2-NEXT: movq %rax, %rdx
-; SSE2-NEXT: shrq $51, %rdx
-; SSE2-NEXT: orl %ecx, %edx
-; SSE2-NEXT: movd %edx, %xmm1
-; SSE2-NEXT: shrq $34, %rax
-; SSE2-NEXT: movd %eax, %xmm2
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_4i17_to_4i32:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movq (%rdi), %rax
-; SSSE3-NEXT: movd %eax, %xmm0
-; SSSE3-NEXT: movq %rax, %rcx
-; SSSE3-NEXT: shrq $17, %rcx
-; SSSE3-NEXT: movd %ecx, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movl 8(%rdi), %ecx
-; SSSE3-NEXT: shll $13, %ecx
-; SSSE3-NEXT: movq %rax, %rdx
-; SSSE3-NEXT: shrq $51, %rdx
-; SSSE3-NEXT: orl %ecx, %edx
-; SSSE3-NEXT: movd %edx, %xmm1
-; SSSE3-NEXT: shrq $34, %rax
-; SSSE3-NEXT: movd %eax, %xmm2
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pand {{.*}}(%rip), %xmm0
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_4i17_to_4i32:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movl 8(%rdi), %eax
-; SSE41-NEXT: shll $13, %eax
-; SSE41-NEXT: movq (%rdi), %rcx
-; SSE41-NEXT: movq %rcx, %rdx
-; SSE41-NEXT: shrq $51, %rdx
-; SSE41-NEXT: orl %eax, %edx
-; SSE41-NEXT: movq %rcx, %rax
-; SSE41-NEXT: shrq $17, %rax
-; SSE41-NEXT: movd %ecx, %xmm0
-; SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; SSE41-NEXT: shrq $34, %rcx
-; SSE41-NEXT: pinsrd $2, %ecx, %xmm0
-; SSE41-NEXT: pinsrd $3, %edx, %xmm0
-; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_4i17_to_4i32:
-; AVX1: # %bb.0:
-; AVX1-NEXT: movl 8(%rdi), %eax
-; AVX1-NEXT: shll $13, %eax
-; AVX1-NEXT: movq (%rdi), %rcx
-; AVX1-NEXT: movq %rcx, %rdx
-; AVX1-NEXT: shrq $51, %rdx
-; AVX1-NEXT: orl %eax, %edx
-; AVX1-NEXT: movq %rcx, %rax
-; AVX1-NEXT: shrq $17, %rax
-; AVX1-NEXT: vmovd %ecx, %xmm0
-; AVX1-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX1-NEXT: shrq $34, %rcx
-; AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_4i17_to_4i32:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl 8(%rdi), %eax
-; AVX2-NEXT: shll $13, %eax
-; AVX2-NEXT: movq (%rdi), %rcx
-; AVX2-NEXT: movq %rcx, %rdx
-; AVX2-NEXT: shrq $51, %rdx
-; AVX2-NEXT: orl %eax, %edx
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: shrq $17, %rax
-; AVX2-NEXT: vmovd %ecx, %xmm0
-; AVX2-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX2-NEXT: shrq $34, %rcx
-; AVX2-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071]
-; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_4i17_to_4i32:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl 8(%rdi), %eax
-; AVX512-NEXT: shll $13, %eax
-; AVX512-NEXT: movq (%rdi), %rcx
-; AVX512-NEXT: movq %rcx, %rdx
-; AVX512-NEXT: shrq $51, %rdx
-; AVX512-NEXT: orl %eax, %edx
-; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: shrq $17, %rax
-; AVX512-NEXT: vmovd %ecx, %xmm0
-; AVX512-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0
-; AVX512-NEXT: shrq $34, %rcx
-; AVX512-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0
-; AVX512-NEXT: vpinsrd $3, %edx, %xmm0, %xmm0
-; AVX512-NEXT: vpbroadcastd {{.*#+}} xmm1 = [131071,131071,131071,131071]
-; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: retq
- %a = load <4 x i17>, <4 x i17>* %ptr
- %b = zext <4 x i17> %a to <4 x i32>
- ret <4 x i32> %b
-}
-
-define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp {
-; SSE2-LABEL: zext_8i6_to_8i64:
-; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movd %edi, %xmm0
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE2-NEXT: paddw {{.*}}(%rip), %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [63,63]
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: zext_8i6_to_8i64:
-; SSSE3: # %bb.0: # %entry
-; SSSE3-NEXT: movd %edi, %xmm0
-; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSSE3-NEXT: paddw {{.*}}(%rip), %xmm3
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,0,3]
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [63,63]
-; SSSE3-NEXT: pand %xmm4, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,3]
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT: pand %xmm4, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,1,2,3]
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT: pand %xmm4, %xmm2
-; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,3,3]
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,6,7]
-; SSSE3-NEXT: pand %xmm4, %xmm3
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: zext_8i6_to_8i64:
-; SSE41: # %bb.0: # %entry
-; SSE41-NEXT: movd %edi, %xmm0
-; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0]
-; SSE41-NEXT: paddw {{.*}}(%rip), %xmm3
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [63,63]
-; SSE41-NEXT: pand %xmm4, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; SSE41-NEXT: pand %xmm4, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,0,1]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; SSE41-NEXT: pand %xmm4, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
-; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
-; SSE41-NEXT: pand %xmm4, %xmm3
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: zext_8i6_to_8i64:
-; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovd %edi, %xmm0
-; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,0,1]
-; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: zext_8i6_to_8i64:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovd %edi, %xmm0
-; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: zext_8i6_to_8i64:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovd %edi, %xmm0
-; AVX512-NEXT: vpbroadcastw %xmm0, %xmm0
-; AVX512-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
-; AVX512-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; AVX512-NEXT: retq
-entry:
- %a = trunc i32 %x to i6
- %b = insertelement <8 x i6> undef, i6 %a, i32 0
- %c = shufflevector <8 x i6> %b, <8 x i6> undef, <8 x i32> zeroinitializer
- %d = add <8 x i6> %c, <i6 0, i6 1, i6 2, i6 3, i6 4, i6 5, i6 6, i6 7>
- %e = zext <8 x i6> %d to <8 x i64>
- ret <8 x i64> %e
-}
-
-define <4 x i64> @splatshuf_zext_v4i64(<4 x i32> %x) {
-; SSE2-LABEL: splatshuf_zext_v4i64:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: splatshuf_zext_v4i64:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: splatshuf_zext_v4i64:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatshuf_zext_v4i64:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatshuf_zext_v4i64:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: splatshuf_zext_v4i64:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpbroadcastd %xmm0, %xmm0
-; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT: retq
- %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
- %ext = zext <4 x i32> %shuf to <4 x i64>
- ret <4 x i64> %ext
-}
-
-define <8 x i32> @splatshuf_zext_v8i32_matching_undefs(<8 x i16> %x) {
-; SSE2-LABEL: splatshuf_zext_v8i32_matching_undefs:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,7,7]
-; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: splatshuf_zext_v8i32_matching_undefs:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[u,u],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: splatshuf_zext_v8i32_matching_undefs:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,8,9,10,11,12,13,14,15]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatshuf_zext_v8i32_matching_undefs:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[6,7],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatshuf_zext_v8i32_matching_undefs:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: splatshuf_zext_v8i32_matching_undefs:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,6,7,6,7,14,15,0,1,6,7,6,7,14,15]
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: retq
- %shuf = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 0, i32 undef, i32 3, i32 7, i32 0, i32 undef, i32 3, i32 7>
- %ext = zext <8 x i16> %shuf to <8 x i32>
- ret <8 x i32> %ext
-}
-
-define <8 x i32> @splatshuf_zext_v8i32_unmatched_undef(<8 x i16> %x) {
-; SSE2-LABEL: splatshuf_zext_v8i32_unmatched_undef:
-; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: splatshuf_zext_v8i32_unmatched_undef:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1],zero,zero,xmm0[2,3],zero,zero,xmm0[6,7],zero,zero,xmm0[14,15],zero,zero
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: splatshuf_zext_v8i32_unmatched_undef:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,14,15,6,7,12,13,14,15]
-; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatshuf_zext_v8i32_unmatched_undef:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatshuf_zext_v8i32_unmatched_undef:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
-; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: splatshuf_zext_v8i32_unmatched_undef:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,14,15,0,1,6,7,6,7,14,15]
-; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: retq
- %shuf = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 3, i32 7, i32 0, i32 undef, i32 3, i32 7>
- %ext = zext <8 x i16> %shuf to <8 x i32>
- ret <8 x i32> %ext
-}
-
-define <16 x i16> @splatshuf_zext_v16i16(<16 x i8> %x) {
-; SSE2-LABEL: splatshuf_zext_v16i16:
-; SSE2: # %bb.0:
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: splatshuf_zext_v16i16:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero
-; SSSE3-NEXT: movdqa %xmm0, %xmm1
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: splatshuf_zext_v16i16:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15]
-; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: splatshuf_zext_v16i16:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero,xmm0[14],zero
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: splatshuf_zext_v16i16:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
-; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: splatshuf_zext_v16i16:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14]
-; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX512-NEXT: retq
- %shuf = shufflevector <16 x i8> %x, <16 x i8> undef, <16 x i32> <i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14, i32 14>
- %ext = zext <16 x i8> %shuf to <16 x i16>
- ret <16 x i16> %ext
-}
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=NARROW
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=WIDE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=WIDE
; FIXME: We shouldn't require both a movd and an insert in the wide version.
define void @update(i64* %dst_i, i64* %src_i, i32 %n) nounwind {
-; NARROW-LABEL: update:
-; NARROW: # %bb.0: # %entry
-; NARROW-NEXT: subl $12, %esp
-; NARROW-NEXT: movl $0, (%esp)
-; NARROW-NEXT: pcmpeqd %xmm0, %xmm0
-; NARROW-NEXT: movdqa {{.*#+}} xmm1 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
-; NARROW-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; NARROW-NEXT: .p2align 4, 0x90
-; NARROW-NEXT: .LBB0_1: # %forcond
-; NARROW-NEXT: # =>This Inner Loop Header: Depth=1
-; NARROW-NEXT: movl (%esp), %eax
-; NARROW-NEXT: cmpl {{[0-9]+}}(%esp), %eax
-; NARROW-NEXT: jge .LBB0_3
-; NARROW-NEXT: # %bb.2: # %forbody
-; NARROW-NEXT: # in Loop: Header=BB0_1 Depth=1
-; NARROW-NEXT: movl (%esp), %eax
-; NARROW-NEXT: leal (,%eax,8), %ecx
-; NARROW-NEXT: movl {{[0-9]+}}(%esp), %edx
-; NARROW-NEXT: addl %ecx, %edx
-; NARROW-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; NARROW-NEXT: addl {{[0-9]+}}(%esp), %ecx
-; NARROW-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; NARROW-NEXT: movq {{.*#+}} xmm3 = mem[0],zero
-; NARROW-NEXT: psubb %xmm0, %xmm3
-; NARROW-NEXT: psrlw $2, %xmm3
-; NARROW-NEXT: pand %xmm1, %xmm3
-; NARROW-NEXT: pxor %xmm2, %xmm3
-; NARROW-NEXT: psubb %xmm2, %xmm3
-; NARROW-NEXT: movq %xmm3, (%edx,%eax,8)
-; NARROW-NEXT: incl (%esp)
-; NARROW-NEXT: jmp .LBB0_1
-; NARROW-NEXT: .LBB0_3: # %afterfor
-; NARROW-NEXT: addl $12, %esp
-; NARROW-NEXT: retl
-;
; WIDE-LABEL: update:
; WIDE: # %bb.0: # %entry
; WIDE-NEXT: subl $12, %esp
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 -x86-experimental-vector-widening-legalization | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
define <4 x i32> @zext_v4i8_to_v4i32(<4 x i8>* %ptr) {
; X86-LABEL: zext_v4i8_to_v4i32:
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -x86-experimental-vector-widening-legalization -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
; Test multiplies of various narrow types.