From: Craig Topper
Date: Fri, 2 Dec 2016 07:57:11 +0000 (+0000)
Subject: [AVX-512] Add EVEX vpshuflw/vpshufhw/vpshufd instructions to load folding tables.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1244527f8c897c72af4c6bedb85f6cc6cc3e38e0;p=llvm

[AVX-512] Add EVEX vpshuflw/vpshufhw/vpshufd instructions to load folding tables.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288484 91177308-0d34-0410-b5e6-96231b3b80d8
---
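The load folding tables pair each register-form opcode with its memory-form
twin so that spill code can fold a stack reload directly into the instruction
instead of materializing the value in a register first. A minimal model of the
rows added below (FoldTableEntry is an illustrative stand-in for the
X86MemoryFoldTableEntry type iterated at the end of the first file; the exact
field types are an assumption, since the struct definition is not part of this
diff):

    #include <cstdint>

    // One fold-table row: register form -> equivalent memory form, plus flags.
    struct FoldTableEntry {
      uint16_t RegOp;  // register form, e.g. X86::VPSHUFDZri ("ri" = reg, imm)
      uint16_t MemOp;  // memory form,   e.g. X86::VPSHUFDZmi ("mi" = mem, imm)
      uint16_t Flags;  // TB_* flags; 0 = no alignment requirement, unfoldable
                       // back to the register form is still allowed
    };

With { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 } registered, a reload such as
"vmovdqa64 -64(%rsp), %zmm1" feeding "vpshufd $27, %zmm1, %zmm0" can be
emitted as the single instruction "vpshufd $27, -64(%rsp), %zmm0", which is
exactly the form the new tests check for.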
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index f5821d71288..d5f33448e98 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -895,6 +895,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
     { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
     { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
+    { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
+    { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
+    { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
 
     // AVX-512 foldable instructions (256-bit versions)
     { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
@@ -923,6 +926,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
     { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
     { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
+    { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
+    { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
+    { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
 
     // AVX-512 foldable instructions (128-bit versions)
     { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
@@ -949,6 +955,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
     { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
     { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
+    { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
+    { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
+    { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
 
     // F16C foldable instructions
     { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
@@ -2092,6 +2101,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
     { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
     { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
+    { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
+    { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
+    { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
 
     // AVX-512VL 256-bit masked foldable instructions
     { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
@@ -2106,6 +2118,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
     { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
     { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
+    { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
+    { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
+    { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
 
     // AVX-512VL 128-bit masked foldable instructions
     { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
@@ -2120,6 +2135,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
     { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
     { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
+    { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
+    { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
+    { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
 
     // AES foldable instructions
     { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
@@ -2471,6 +2489,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
     { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
     { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
+    { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
+    { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
+    { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
 
     // AVX-512VL 256-bit masked foldable instructions
     { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
@@ -2485,6 +2506,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
     { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
     { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
+    { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
+    { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
+    { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
 
     // AVX-512VL 128-bit masked foldable instructions
     { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
@@ -2499,6 +2523,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
     { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
     { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
+    { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
+    { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
+    { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
   };
 
   for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
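Each test below forces a spill of %a0 by calling an inline-asm "nop" that
clobbers xmm1-xmm31 and the flags; FileCheck then verifies that the reload is
folded into the shuffle (the "Folded Reload" memory operand) instead of being
done with a separate move. The shufflevector masks mirror the instruction's
8-bit immediate: two bits per destination element, applied within each 128-bit
lane, so $27 = 0b00011011 selects source elements 3,2,1,0. An illustrative
helper (not part of the patch) showing that decoding:

    #include <array>
    #include <cstdint>

    // PSHUFD immediate decoding: destination dword i of each 128-bit lane
    // takes source dword ((imm >> 2*i) & 3) of the same lane.
    std::array<int, 4> decodePshufdImm(uint8_t imm) {
      std::array<int, 4> m;
      for (int i = 0; i < 4; ++i)
        m[i] = (imm >> (2 * i)) & 0x3;
      return m; // imm = 27 -> {3, 2, 1, 0}
    }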
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 6b67e506eaa..e4bb621957d 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -727,3 +727,87 @@ define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64
   %2 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> zeroinitializer, i64 %mask)
   ret <64 x i8> %2
 }
+
+define <16 x i32> @stack_fold_pshufd_zmm(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufd_zmm
+  ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  ret <16 x i32> %2
+}
+
+define <16 x i32> @stack_fold_pshufd_zmm_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufd_zmm_mask
+  ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %passthru
+  ret <16 x i32> %4
+}
+
+define <16 x i32> @stack_fold_pshufd_zmm_maskz(<16 x i32> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufd_zmm_maskz
+  ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  ret <16 x i32> %4
+}
+
+define <32 x i16> @stack_fold_pshufhw_zmm(<32 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufhw_zmm
+  ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
+  ret <32 x i16> %2
+}
+
+define <32 x i16> @stack_fold_pshufhw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufhw_zmm_mask
+  ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
+  ret <32 x i16> %4
+}
+
+define <32 x i16> @stack_fold_pshufhw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufhw_zmm_maskz
+  ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12, i32 16, i32 17, i32 18, i32 19, i32 23, i32 22, i32 21, i32 20, i32 24, i32 25, i32 26, i32 27, i32 31, i32 30, i32 29, i32 28>
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  ret <32 x i16> %4
+}
+
+define <32 x i16> @stack_fold_pshuflw_zmm(<32 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pshuflw_zmm
+  ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+  ret <32 x i16> %2
+}
+
+define <32 x i16> @stack_fold_pshuflw_zmm_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pshuflw_zmm_mask
+  ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %passthru
+  ret <32 x i16> %4
+}
+
+define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pshuflw_zmm_maskz
+  ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i16> %a0, <32 x i16> undef, <32 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15, i32 19, i32 18, i32 17, i32 16, i32 20, i32 21, i32 22, i32 23, i32 27, i32 26, i32 25, i32 24, i32 28, i32 29, i32 30, i32 31>
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  ret <32 x i16> %4
+}
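The AVX-512VL file below repeats the same pattern for the 128- and 256-bit
forms. For the word shuffles, the immediate's two-bit fields move only the
high (vpshufhw) or low (vpshuflw) four words of each 128-bit lane and leave
the other half in place; the xmm pshufhw test uses $11 = 0b00001011, i.e.
high-half source order 3,2,0,0, which appears as indices 7,6,4,4 in its
shuffle mask. A companion sketch of that decoding (again illustrative, not
part of the patch):

    #include <array>
    #include <cstdint>

    // PSHUFHW immediate decoding: words 0..3 of each 128-bit lane are
    // unchanged; word 4+i takes source word 4 + ((imm >> 2*i) & 3).
    std::array<int, 8> decodePshufhwImm(uint8_t imm) {
      std::array<int, 8> m{0, 1, 2, 3, 0, 0, 0, 0};
      for (int i = 0; i < 4; ++i)
        m[4 + i] = 4 + ((imm >> (2 * i)) & 0x3);
      return m; // imm = 11 -> {0,1,2,3, 7,6,4,4}
    }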
sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> + %3 = bitcast i8 %mask to <8 x i1> + %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> + %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer + ret <4 x i32> %5 +} + +define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) { + ;CHECK-LABEL: stack_fold_pshufhw + ;CHECK: vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + ret <8 x i16> %2 +} + +define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { + ;CHECK-LABEL: stack_fold_pshufhw_mask + ;CHECK: vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru + ret <8 x i16> %4 +} + +define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) { + ;CHECK-LABEL: stack_fold_pshufhw_maskz + ;CHECK: vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer + ret <8 x i16> %4 +} + +define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { + ;CHECK-LABEL: stack_fold_pshuflw + ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + ret <8 x i16> %2 +} + +define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { + ;CHECK-LABEL: stack_fold_pshuflw_mask + ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm 
sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru + ret <8 x i16> %4 +} + +define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) { + ;CHECK-LABEL: stack_fold_pshuflw_maskz + ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer + ret <8 x i16> %4 +} + +define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) { + ;CHECK-LABEL: stack_fold_pshufd_ymm + ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> + ret <8 x i32> %2 +} + +define <8 x i32> @stack_fold_pshufd_ymm_mask(<8 x i32> %passthru, <8 x i32> %a0, i8 %mask) { + ;CHECK-LABEL: stack_fold_pshufd_ymm_mask + ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %passthru + ret <8 x i32> %4 +} + +define <8 x i32> @stack_fold_pshufd_ymm_maskz(<8 x i32> %a0, i8 %mask) { + ;CHECK-LABEL: stack_fold_pshufd_ymm_maskz + ;CHECK: vpshufd $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> + %3 = bitcast i8 %mask to <8 x i1> + %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer + ret <8 x i32> %4 +} + +define <16 x i16> @stack_fold_vpshufhw_ymm(<16 x i16> %a0) { + ;CHECK-LABEL: stack_fold_vpshufhw_ymm + ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte 
Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + ret <16 x i16> %2 +} + +define <16 x i16> @stack_fold_vpshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) { + ;CHECK-LABEL: stack_fold_vpshufhw_ymm_mask + ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru + ret <16 x i16> %4 +} + +define <16 x i16> @stack_fold_vpshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) { + ;CHECK-LABEL: stack_fold_vpshufhw_ymm_maskz + ;CHECK: vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer + ret <16 x i16> %4 +} + +define <16 x i16> @stack_fold_vpshuflw_ymm(<16 x i16> %a0) { + ;CHECK-LABEL: stack_fold_vpshuflw_ymm + ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + ret <16 x i16> %2 +} + +define <16 x i16> @stack_fold_vpshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) { + ;CHECK-LABEL: stack_fold_vpshuflw_ymm_mask + ;CHECK: vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %passthru + ret <16 x i16> %4 +} + +define <16 x i16> @stack_fold_vpshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) { + ;CHECK-LABEL: stack_fold_vpshuflw_ymm_maskz + ;CHECK: vpshuflw $27, 
{{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> + %3 = bitcast i16 %mask to <16 x i1> + %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer + ret <16 x i16> %4 +}