From: Craig Topper
Date: Fri, 2 Dec 2016 07:06:30 +0000 (+0000)
Subject: [AVX-512] Add EVEX PSHUFB instructions to load folding tables.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=9c40e33b57c4791029869559705ffa721eb87f48;p=llvm

[AVX-512] Add EVEX PSHUFB instructions to load folding tables.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@288482 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 0f904930127..f5821d71288 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1873,6 +1873,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
     { X86::VPORDZrr, X86::VPORDZrm, 0 },
     { X86::VPORQZrr, X86::VPORQZrm, 0 },
+    { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
     { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
     { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
     { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
@@ -2023,6 +2024,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
     { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
     { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
+    { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
+    { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
     { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
     { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
     { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
@@ -2297,6 +2300,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
     { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
     { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
+    { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
     { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
     { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
     { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
@@ -2363,6 +2367,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
     { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
     { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+    { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
     { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
     { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
     { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
@@ -2425,6 +2430,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
     { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
     { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+    { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
     { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
     { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
     { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
@@ -2562,6 +2568,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
     { X86::VPORDZrrk, X86::VPORDZrmk, 0 },
     { X86::VPORQZrrk, X86::VPORQZrmk, 0 },
+    { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
     { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
     { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
     { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
@@ -2631,6 +2638,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
     { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
     { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+    { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
     { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
     { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
     { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
@@ -2697,6 +2705,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
     { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
     { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+    { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
     { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
     { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
     { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 577b1c06895..6b67e506eaa 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -701,3 +701,29 @@ define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i
   %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
   ret <64 x i8> %4
 }
+
+define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pshufb_zmm
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+
+define <64 x i8> @stack_fold_pshufb_zmm_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufb_zmm_mask
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = load <64 x i8>, <64 x i8>* %passthru
+  %3 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %2, i64 %mask)
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @stack_fold_pshufb_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufb_zmm_maskz
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %2
+}
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index d11e744c84b..62a7829026b 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -992,3 +992,55 @@ define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i
   %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
   ret <32 x i8> %4
 }
+
+define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pshufb
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> undef, i16 -1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) nounwind readnone
+
+define <16 x i8> @stack_fold_pshufb_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufb_mask
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = load <16 x i8>, <16 x i8>* %passthru
+  %3 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %2, i16 %mask)
+  ret <16 x i8> %3
+}
+
+define <16 x i8> @stack_fold_pshufb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufb_maskz
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %2
+}
+
+define <32 x i8> @stack_fold_pshufb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pshufb_ymm
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> undef, i32 -1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <32 x i8> @stack_fold_pshufb_ymm_mask(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufb_ymm_mask
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = load <32 x i8>, <32 x i8>* %passthru
+  %3 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %2, i32 %mask)
+  ret <32 x i8> %3
+}
+
+define <32 x i8> @stack_fold_pshufb_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufb_ymm_maskz
+  ;CHECK:       vpshufb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %2
+}
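
Background note (editor's addition, not part of the patch): each row in these tables pairs a
register-register opcode with its register-memory twin, so that when an operand has been spilled
to the stack the register allocator can fold the reload directly into the instruction instead of
emitting a separate load. The C++ sketch below is a minimal, self-contained illustration of that
lookup; the opcode numbers, table name, and std::unordered_map index are invented for the example,
while the real entries, flag values, and query paths live in lib/Target/X86/X86InstrInfo.cpp as
the diff above shows.

#include <cstdint>
#include <iostream>
#include <unordered_map>

// Stand-ins for the llvm::X86::* opcode enum values (numbers invented here).
enum Opcode : std::uint16_t {
  VPSHUFBZrr = 100, // vpshufb zmm, zmm, zmm (register form)
  VPSHUFBZrm,       // vpshufb zmm, zmm, mem (load-folded form)
};

// Shape of one folding-table row: register-form opcode, memory-form opcode,
// and flags (the third column in the patch; 0 means no special requirements).
struct FoldTableEntry {
  std::uint16_t RegOp;
  std::uint16_t MemOp;
  std::uint16_t Flags;
};

// The kind of row this commit adds for the EVEX-encoded PSHUFB variants.
static const FoldTableEntry FoldTable2[] = {
    {VPSHUFBZrr, VPSHUFBZrm, 0},
};

int main() {
  // The X86InstrInfo constructor builds opcode-keyed maps from such arrays;
  // a plain unordered_map stands in for that index here.
  std::unordered_map<std::uint16_t, FoldTableEntry> RegToMem;
  for (const FoldTableEntry &E : FoldTable2)
    RegToMem.emplace(E.RegOp, E);

  // Folding query: given a register-form opcode whose source operand sits in
  // a spill slot, look up the opcode that folds the reload into the shuffle.
  auto It = RegToMem.find(VPSHUFBZrr);
  if (It != RegToMem.end())
    std::cout << "fold opcode " << It->second.RegOp << " -> "
              << It->second.MemOp << " (reload folded into vpshufb)\n";
  return 0;
}

The tests exercise exactly this path: the inline asm "nop" clobbers xmm2 through xmm31, forcing
the vector arguments to be spilled across it, and the CHECK patterns then require that a reload
comes back as a vpshufb memory operand ("Folded Reload") in the plain, merge-masked ({%k}), and
zero-masked ({%k} {z}) forms at each of the 128-, 256-, and 512-bit widths.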