From: Craig Topper
Date: Sun, 5 Feb 2017 22:25:46 +0000 (+0000)
Subject: [AVX-512] Add scalar masked max/min intrinsic instructions to the load folding tables.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=053d7dd312d55d535f8d87df8451ac0d3007c671;p=llvm

[AVX-512] Add scalar masked max/min intrinsic instructions to the load folding tables.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@294153 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index ac44f735bc6..8885393c742 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2339,10 +2339,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMAXCPSZrrkz,      X86::VMAXCPSZrmkz,        0 },
     { X86::VMAXPDZrrkz,       X86::VMAXPDZrmkz,         0 },
     { X86::VMAXPSZrrkz,       X86::VMAXPSZrmkz,         0 },
+    { X86::VMAXSDZrr_Intkz,   X86::VMAXSDZrm_Intkz,     0 },
+    { X86::VMAXSSZrr_Intkz,   X86::VMAXSSZrm_Intkz,     0 },
     { X86::VMINCPDZrrkz,      X86::VMINCPDZrmkz,        0 },
     { X86::VMINCPSZrrkz,      X86::VMINCPSZrmkz,        0 },
     { X86::VMINPDZrrkz,       X86::VMINPDZrmkz,         0 },
     { X86::VMINPSZrrkz,       X86::VMINPSZrmkz,         0 },
+    { X86::VMINSDZrr_Intkz,   X86::VMINSDZrm_Intkz,     0 },
+    { X86::VMINSSZrr_Intkz,   X86::VMINSSZrm_Intkz,     0 },
     { X86::VMULPDZrrkz,       X86::VMULPDZrmkz,         0 },
     { X86::VMULPSZrrkz,       X86::VMULPSZrmkz,         0 },
     { X86::VMULSDZrr_Intkz,   X86::VMULSDZrm_Intkz,     TB_NO_REVERSE },
@@ -2674,10 +2678,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMAXCPSZrrk,       X86::VMAXCPSZrmk,         0 },
     { X86::VMAXPDZrrk,        X86::VMAXPDZrmk,          0 },
     { X86::VMAXPSZrrk,        X86::VMAXPSZrmk,          0 },
+    { X86::VMAXSDZrr_Intk,    X86::VMAXSDZrm_Intk,      0 },
+    { X86::VMAXSSZrr_Intk,    X86::VMAXSSZrm_Intk,      0 },
     { X86::VMINCPDZrrk,       X86::VMINCPDZrmk,         0 },
     { X86::VMINCPSZrrk,       X86::VMINCPSZrmk,         0 },
     { X86::VMINPDZrrk,        X86::VMINPDZrmk,          0 },
     { X86::VMINPSZrrk,        X86::VMINPSZrmk,          0 },
+    { X86::VMINSDZrr_Intk,    X86::VMINSDZrm_Intk,      0 },
+    { X86::VMINSSZrr_Intk,    X86::VMINSSZrm_Intk,      0 },
     { X86::VMULPDZrrk,        X86::VMULPDZrmk,          0 },
     { X86::VMULPSZrrk,        X86::VMULPSZrmk,          0 },
     { X86::VMULSDZrr_Intk,    X86::VMULSDZrm_Intk,      TB_NO_REVERSE },
@@ -7744,6 +7752,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
     case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
     case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
+    case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
+    case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
     case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
     case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
     case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
@@ -7793,6 +7803,8 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
     case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
     case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
+    case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
+    case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
     case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
     case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
     case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 76c4246fdf8..1213fb1ec66 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -2510,6 +2510,39 @@ define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
   ret <4 x float> %res
 }
+
+define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_max_ss_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a1.val = load float, float* %a1
+  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+  %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
+; CHECK-LABEL: test_maskz_max_ss_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %a1.val = load float, float* %a1
+  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+  %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
 declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
 
 define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
@@ -2576,6 +2609,35 @@ define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
   ret <2 x double> %res
 }
 
+define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_max_sd_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a1.val = load double, double* %a1
+  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+  %a1v  = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
+; CHECK-LABEL: test_maskz_max_sd_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %a1.val = load double, double* %a1
+  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+  %a1v  = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
+
 define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
 ; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
 ; CHECK:       ## BB#0:
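Note (illustrative, not part of the commit): the folding added here targets source patterns like the following AVX-512F intrinsics sketch, where the scalar load feeding the second operand can now be merged into the masked vmaxss itself instead of being emitted as a separate vmovss plus a register-register max. The function name and arguments are hypothetical.

    #include <immintrin.h>

    /* Hypothetical example: masked scalar max with a memory operand.
       With the new VMAXSSZrr_Intk -> VMAXSSZrm_Intk table entry, the
       load of *b can be folded into the masked instruction, e.g.
       vmaxss (%rdi), %xmm0, %xmm1 {%k1}, mirroring the
       test_mask_max_ss_memfold test above. */
    __m128 mask_max_ss_from_mem(__m128 a, const float *b, __m128 src,
                                __mmask8 k) {
      __m128 bv = _mm_load_ss(b);            /* element 0 = *b, upper lanes zeroed */
      return _mm_mask_max_ss(src, k, a, bv); /* masked scalar max */
    }

The new cases in isNonFoldablePartialRegisterLoad mirror the existing ADD/DIV/MUL/SUB handling, so the partial-register-load safety check covers the masked max/min forms as well.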