From 9fc96e56469da510767dba9a7963283801ba15c6 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 26 Oct 2016 04:59:58 +0000
Subject: [PATCH] [AVX-512] Add scalar vfmsub/vfnmsub mask3 intrinsics

Summary:
Clang's intrinsic header currently tries to negate the third operand of a
vfmadd mask3 in order to create vfmsub, but this fails isel. This patch adds
scalar vfmsub and vfnmsub mask3 that we can use instead to avoid the negate.
This is consistent with the packed instructions.

Reviewers: igorb, delena

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D25933

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@285173 91177308-0d34-0410-b5e6-96231b3b80d8
---
Reviewer notes (kept below the three-dash marker so they stay out of the
commit message):

  * IntrinsicsX86.td gains four definitions:
      int_x86_avx512_mask3_vfmsub_sd,  int_x86_avx512_mask3_vfmsub_ss,
      int_x86_avx512_mask3_vfnmsub_sd, int_x86_avx512_mask3_vfnmsub_ss.
    Each returns the same vector type as its first three operands
    (v2f64 for *_sd, v4f32 for *_ss), takes two further operands of type
    i8 and i32, and is marked IntrNoMem.
    NOTE(review): the i8/i32 operands are presumably the write-mask and the
    rounding-mode selector, as for the existing mask3 vfmadd forms; confirm
    against the FMA_OP_SCALAR_MASK3 lowering, which this patch does not show.

  * X86IntrinsicsInfo.h gains four table rows, all of kind
    FMA_OP_SCALAR_MASK3, using X86ISD::FMSUB_RND for the vfmsub pair and
    X86ISD::FNMSUB_RND for the vfnmsub pair, with 0 as the last field.
    The rows are inserted after the matching *_ps_512 entries.

  * avx512-intrinsics.ll gains one test per intrinsic. Every test calls the
    intrinsic four times (i8 operand -1 or %x3, combined with i32 operand
    4 or 3) and sums the four results with fadd. The expected assembly for
    each test contains two FMA instructions without a rounding annotation
    and two carrying {rz-sae}; two of the four are predicated on {%k1}.

 include/llvm/IR/IntrinsicsX86.td      |  24 ++++++
 lib/Target/X86/X86IntrinsicsInfo.h    |   4 +
 test/CodeGen/X86/avx512-intrinsics.ll | 112 ++++++++++++++++++++++++++
 3 files changed, 140 insertions(+)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 05f7056be82..08209890802 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3074,6 +3074,18 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
           [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
           llvm_i32_ty], [IntrNoMem]>;
 
+  def int_x86_avx512_mask3_vfmsub_sd :
+          GCCBuiltin<"__builtin_ia32_vfmsubsd3_mask3">,
+          Intrinsic<[llvm_v2f64_ty],
+          [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask3_vfmsub_ss :
+          GCCBuiltin<"__builtin_ia32_vfmsubss3_mask3">,
+          Intrinsic<[llvm_v4f32_ty],
+          [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
   def int_x86_avx512_mask3_vfmsub_pd_128 :
           GCCBuiltin<"__builtin_ia32_vfmsubpd128_mask3">,
           Intrinsic<[llvm_v2f64_ty],
@@ -3182,6 +3194,18 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
           [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty,
           llvm_i32_ty], [IntrNoMem]>;
 
+  def int_x86_avx512_mask3_vfnmsub_sd :
+          GCCBuiltin<"__builtin_ia32_vfnmsubsd3_mask3">,
+          Intrinsic<[llvm_v2f64_ty],
+          [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
+  def int_x86_avx512_mask3_vfnmsub_ss :
+          GCCBuiltin<"__builtin_ia32_vfnmsubss3_mask3">,
+          Intrinsic<[llvm_v4f32_ty],
+          [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty,
+          llvm_i32_ty], [IntrNoMem]>;
+
   def int_x86_avx512_mask_vfnmsub_pd_128 :
           GCCBuiltin<"__builtin_ia32_vfnmsubpd128_mask">,
           Intrinsic<[llvm_v2f64_ty],
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index fa2f613468a..a8d3a6c9cb5 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -1521,6 +1521,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
                      X86ISD::FMSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUB_RND, 0),
 
   X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1539,6 +1541,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
                      X86ISD::FNMSUB_RND),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUB_RND, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
                      X86ISD::VFIXUPIMM, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 0c739763f54..711a6e6c297 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -5694,6 +5694,118 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
   ret <4 x float> %res6
 }
 
+declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_sd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm4
+; CHECK-NEXT:    vfmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT:    vfmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+  %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+  %res4 = fadd <2 x double> %res, %res1
+  %res5 = fadd <2 x double> %res2, %res3
+  %res6 = fadd <2 x double> %res4, %res5
+  ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ss:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm4
+; CHECK-NEXT:    vfmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; CHECK-NEXT:    vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT:    vfmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+  %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+  %res4 = fadd <4 x float> %res, %res1
+  %res5 = fadd <4 x float> %res2, %res3
+  %res6 = fadd <4 x float> %res4, %res5
+  ret <4 x float> %res6
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_sd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm4
+; CHECK-NEXT:    vfnmsub213sd %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT:    vfnmsub213sd {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
+; CHECK-NEXT:    vaddpd %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 3)
+  %res3 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 3)
+  %res4 = fadd <2 x double> %res, %res1
+  %res5 = fadd <2 x double> %res2, %res3
+  %res6 = fadd <2 x double> %res4, %res5
+  ret <2 x double> %res6
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3,i32 %x4 ){
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ss:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm4
+; CHECK-NEXT:    vfnmsub213ss %xmm2, %xmm0, %xmm4
+; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; CHECK-NEXT:    vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT:    vfnmsub213ss {rz-sae}, %xmm2, %xmm0, %xmm1
+; CHECK-NEXT:    vaddps %xmm3, %xmm4, %xmm0
+; CHECK-NEXT:    vaddps %xmm5, %xmm1, %xmm1
+; CHECK-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 3)
+  %res3 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 3)
+  %res4 = fadd <4 x float> %res, %res1
+  %res5 = fadd <4 x float> %res2, %res3
+  %res6 = fadd <4 x float> %res4, %res5
+  ret <4 x float> %res6
+}
+
 define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss_rm(<4 x float> %x0, <4 x float> %x1, float *%ptr_b ,i8 %x3,i32 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ss_rm:
 ; CHECK:       ## BB#0:
-- 
2.40.0