From 371f918f705b7e4c42e5ba3559048d0649e6c649 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 5 Feb 2017 22:25:42 +0000
Subject: [PATCH] [AVX-512] Add scalar masked add/sub/mul/div intrinsic
 instructions to the load folding tables.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294152 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrInfo.cpp       | 24 +++++++++++
 test/CodeGen/X86/avx512-intrinsics.ll | 62 +++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index ced83c140de..ac44f735bc6 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2315,6 +2315,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // AVX-512 masked instructions
     { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
     { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
+    { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE },
+    { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE },
     { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
     { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
     { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
@@ -2323,6 +2325,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
     { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
     { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
+    { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE },
+    { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE },
     { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
     { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
     { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
@@ -2341,6 +2345,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
     { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
     { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
+    { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE },
+    { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE },
     { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
     { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
     { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
@@ -2389,6 +2395,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
     { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
     { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+    { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE },
+    { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE },
     { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
     { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
     { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
@@ -2642,6 +2650,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // AVX-512 foldable masked instructions
     { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
     { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
+    { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE },
+    { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE },
     { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
     { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
     { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
@@ -2650,6 +2660,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
     { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
     { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
+    { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE },
+    { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE },
     { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
     { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
     { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
@@ -2668,6 +2680,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
     { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
     { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
+    { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE },
+    { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE },
     { X86::VORPDZrrk, X86::VORPDZrmk, 0 },
     { X86::VORPSZrrk, X86::VORPSZrmk, 0 },
     { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
@@ -2729,6 +2743,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
     { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
     { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+    { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE },
+    { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE },
     { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
     { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
     { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
@@ -7726,6 +7742,10 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
   case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
   case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
   case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
+  case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
+  case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
+  case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
+  case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
   case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
   case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
   case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
@@ -7771,6 +7791,10 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
   case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
   case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
   case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
+  case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
+  case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
+  case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
+  case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
   case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
   case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
   case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index fbd49a3258e..76c4246fdf8 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -2301,6 +2301,39 @@ define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
   ret <4 x float> %res
 }
 
+define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_add_ss_current_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a1.val = load float, float* %a1
+  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+  %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
+; CHECK-LABEL: test_maskz_add_ss_current_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %a1.val = load float, float* %a1
+  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+  %a1v = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
+
 declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
 
 define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
@@ -2383,6 +2416,35 @@ define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
   ret <2 x double> %res
 }
 
+define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_add_sd_current_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddsd (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a1.val = load double, double* %a1
+  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
+; CHECK-LABEL: test_maskz_add_sd_current_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %a1.val = load double, double* %a1
+  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
+
 declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
 
 define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-- 
2.50.1
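
Background note (not part of the patch itself): the new folding-table entries let the load-folding machinery rewrite a masked scalar-intrinsic add/sub/mul/div whose operand comes from a full scalar load into the memory form of the instruction, as the new IR tests above check. The C sketch below is only an illustration of source code that exercises this kind of pattern; the function names and the suggested build line are my assumptions, not anything taken from the patch or the LLVM tree.

// Illustrative sketch only (assumed file name example.c, assumed build line
// "clang -O2 -mavx512f -S example.c"). With entries like the ones added
// above, codegen for helpers of this shape is expected to fold the scalar
// load into the masked instruction, e.g.
//   vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
// instead of a separate vmovss followed by a register-register vaddss.
#include <immintrin.h>

// Zero-masked scalar single-precision add with the second operand in memory.
__m128 maskz_add_ss_from_mem(__m128 a, const float *p, __mmask8 k) {
  __m128 b = _mm_load_ss(p);         // scalar load, upper lanes zeroed
  return _mm_maskz_add_ss(k, a, b);  // AVX-512F zero-masked scalar add
}

// Merge-masked scalar double-precision add with the second operand in memory.
__m128d mask_add_sd_from_mem(__m128d src, __m128d a, const double *p,
                             __mmask8 k) {
  __m128d b = _mm_load_sd(p);            // scalar load, upper lane zeroed
  return _mm_mask_add_sd(src, k, a, b);  // AVX-512F merge-masked scalar add
}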