From: Craig Topper
Date: Sat, 26 Nov 2016 08:21:48 +0000 (+0000)
Subject: [AVX-512] Add masked 128/256-bit integer add/sub instructions to load folding tables.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=d3611566cf87960fc24d30d0d4fe46525beda903;p=llvm

[AVX-512] Add masked 128/256-bit integer add/sub instructions to load folding tables.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@287974 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 91ea11d37ee..ed2ed38f622 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2191,12 +2191,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
     { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
     { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
+    { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
+    { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
+    { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
+    { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
+    { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
+    { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
+    { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
+    { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
     { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
     { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
     { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
     { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
     { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
     { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
+    { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
+    { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
+    { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
+    { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
+    { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
+    { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
+    { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
+    { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
     { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
     { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
     { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
@@ -2225,12 +2241,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
     { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
     { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
+    { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
+    { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
+    { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
+    { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
+    { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
+    { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
+    { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
+    { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
     { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
     { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
     { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
     { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
     { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
     { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
+    { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
+    { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
+    { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
+    { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
+    { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
+    { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
+    { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
+    { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
     { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
     { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
     { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
@@ -2328,12 +2360,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
     { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
     { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
+    { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
+    { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
+    { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
+    { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
+    { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
+    { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
+    { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
+    { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
     { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
     { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
     { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
     { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
     { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
     { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
+    { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
+    { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
+    { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
+    { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
+    { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
+    { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
+    { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
+    { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
     { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
     { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
     { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
@@ -2366,12 +2414,28 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
     { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
     { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
+    { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
+    { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
+    { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
+    { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
+    { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
+    { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
+    { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
+    { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
     { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
     { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
     { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
     { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
     { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
     { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
+    { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
+    { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
+    { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
+    { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
+    { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
+    { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
+    { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
+    { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
     { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
     { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
     { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index 282de4bedac..1229c4273c1 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -16,6 +16,28 @@ define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %2
 }
 
+define <16 x i8> @stack_fold_paddb_mask(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_paddb_mask
+  ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = add <16 x i8> %a0, %a1
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled about the asm block
+  %4 = load <16 x i8>, <16 x i8>* %a2
+  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
+  ret <16 x i8> %5
+}
+
+define <16 x i8> @stack_fold_paddb_maskz(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_paddb_maskz
+  ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = add <16 x i8> %a0, %a1
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
+  ret <16 x i8> %4
+}
+
 define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_paddb_ymm
   ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
@@ -24,6 +46,28 @@ define <32 x i8> @stack_fold_paddb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
   ret <32 x i8> %2
 }
 
+define <32 x i8> @stack_fold_paddb_mask_ymm(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %a2, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_paddb_mask_ymm
+  ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = add <32 x i8> %a0, %a1
+  %3 = bitcast i32 %mask to <32 x i1>
+  ; load needed to keep the operation from being scheduled about the asm block
+  %4 = load <32 x i8>, <32 x i8>* %a2
+  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
+  ret <32 x i8> %5
+}
+
+define <32 x i8> @stack_fold_paddb_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_paddb_maskz_ymm
+  ;CHECK: vpaddb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = add <32 x i8> %a0, %a1
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
+  ret <32 x i8> %4
+}
+
 define <4 x i32> @stack_fold_paddd(<4 x i32> %a0, <4 x i32> %a1) {
   ;CHECK-LABEL: stack_fold_paddd
   ;CHECK: vpaddd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
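
As a reference for the subtract half of this change, the minimal sketch below shows the same stack-folding pattern applied to a masked integer subtract, which the new VPSUB*Z128rmk table entries are meant to cover. It is not part of the commit; the function name is hypothetical and simply mirrors stack_fold_paddb_mask above, with the inline-asm clobber list forcing a spill and reload of the vector operands just as in the in-tree tests.

define <16 x i8> @stack_fold_psubb_mask_sketch(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %a2, i16 %mask) {
  ; Clobbering xmm2-xmm31 across the asm block forces the operands to be spilled and reloaded.
  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
  %2 = sub <16 x i8> %a0, %a1
  %3 = bitcast i16 %mask to <16 x i1>
  ; As in the tests above, this load supplies the merge value and keeps the sub from being
  ; scheduled across the asm block.
  %4 = load <16 x i8>, <16 x i8>* %a2
  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
  ; With the new folding-table entry, the reload should be folded into a masked vpsubb
  ; (VPSUBBZ128rmk) rather than emitted as a separate vector load.
  ret <16 x i8> %5
}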