From bdfe15d215b66b4f85bd54740ac26214e5a06287 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 14 Jan 2019 00:03:50 +0000
Subject: [PATCH] [X86] Remove mask parameter from vpshufbitqmb intrinsics.
 Change result to a vXi1 vector.

The input mask can be represented with an AND in IR.

Fixes PR40258

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@351028 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsX86.td            | 21 +++-------
 lib/IR/AutoUpgrade.cpp                      | 15 +++++++
 lib/Target/X86/X86ISelLowering.cpp          | 34 ---------------
 lib/Target/X86/X86IntrinsicsInfo.h          | 12 ++----
 test/CodeGen/X86/vpshufbitqbm-intrinsics.ll | 46 +++++++++++--------
 5 files changed, 53 insertions(+), 75 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 91ad297666b..4a1e5edeaa6 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1312,21 +1312,12 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
 
 // BITALG bits shuffle
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_vpshufbitqmb_128 :
-      GCCBuiltin<"__builtin_ia32_vpshufbitqmb128_mask">,
-      Intrinsic<[llvm_i16_ty],
-                [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
-                [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshufbitqmb_256 :
-      GCCBuiltin<"__builtin_ia32_vpshufbitqmb256_mask">,
-      Intrinsic<[llvm_i32_ty],
-                [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
-                [IntrNoMem]>;
-  def int_x86_avx512_mask_vpshufbitqmb_512 :
-      GCCBuiltin<"__builtin_ia32_vpshufbitqmb512_mask">,
-      Intrinsic<[llvm_i64_ty],
-                [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
-                [IntrNoMem]>;
+  def int_x86_avx512_vpshufbitqmb_128 :
+      Intrinsic<[llvm_v16i1_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshufbitqmb_256 :
+      Intrinsic<[llvm_v32i1_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
+  def int_x86_avx512_vpshufbitqmb_512 :
+      Intrinsic<[llvm_v64i1_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 02cad5c71d1..ff003a7addd 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -298,6 +298,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.max.p") || // Added in 7.0. 128/256 in 5.0
       Name.startswith("avx512.mask.min.p") || // Added in 7.0. 128/256 in 5.0
       Name.startswith("avx512.mask.fpclass.p") || // Added in 7.0
+      Name.startswith("avx512.mask.vpshufbitqmb.") || // Added in 8.0
       Name == "sse.cvtsi2ss" || // Added in 7.0
       Name == "sse.cvtsi642ss" || // Added in 7.0
       Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -1758,6 +1759,20 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
     bool CmpEq = Name[16] == 'e';
     Rep = upgradeMaskedCompare(Builder, *CI, CmpEq ? 0 : 6, true);
+  } else if (IsX86 && Name.startswith("avx512.mask.vpshufbitqmb.")) {
+    Type *OpTy = CI->getArgOperand(0)->getType();
+    unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
+    Intrinsic::ID IID;
+    switch (VecWidth) {
+    default: llvm_unreachable("Unexpected intrinsic");
+    case 128: IID = Intrinsic::x86_avx512_vpshufbitqmb_128; break;
+    case 256: IID = Intrinsic::x86_avx512_vpshufbitqmb_256; break;
+    case 512: IID = Intrinsic::x86_avx512_vpshufbitqmb_512; break;
+    }
+
+    Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                             { CI->getOperand(0), CI->getArgOperand(1) });
+    Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2));
   } else if (IsX86 && Name.startswith("avx512.mask.fpclass.p")) {
     Type *OpTy = CI->getArgOperand(0)->getType();
     unsigned VecWidth = OpTy->getPrimitiveSizeInBits();
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ed1672cd4e9..bcf3e51e2ca 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -21378,14 +21378,6 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
 
   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
 
-  switch (Op.getOpcode()) {
-  default: break;
-  case X86ISD::CMPM:
-  case X86ISD::CMPM_RND:
-  case X86ISD::VPSHUFBITQMB:
-  case X86ISD::VFPCLASS:
-    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
-  }
   if (PreservedSrc.isUndef())
     PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
   return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
@@ -21841,32 +21833,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                 FPclassMask, DAG.getIntPtrConstant(0, dl));
     return DAG.getBitcast(MVT::i8, Ins);
   }
-  case CMP_MASK: {
-    // Comparison intrinsics with masks.
-    // Example of transformation:
-    // (i8 (int_x86_avx512_mask_pcmpeq_q_128
-    //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
-    // (i8 (bitcast
-    //   (v8i1 (insert_subvector zero,
-    //           (v2i1 (and (PCMPEQM %a, %b),
-    //                      (extract_subvector
-    //                         (v8i1 (bitcast %mask)), 0))), 0))))
-    MVT VT = Op.getOperand(1).getSimpleValueType();
-    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
-    SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
-    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
-                                     Mask.getSimpleValueType().getSizeInBits());
-    SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
-                              Op.getOperand(2));
-    SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
-                                           Subtarget, DAG);
-    // Need to fill with zeros to ensure the bitcast will produce zeroes
-    // for the upper bits in the v2i1/v4i1 case.
-    SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
-                              DAG.getConstant(0, dl, BitcastVT),
-                              CmpMask, DAG.getIntPtrConstant(0, dl));
-    return DAG.getBitcast(Op.getValueType(), Res);
-  }
   case CMP_MASK_CC: {
     MVT MaskVT = Op.getSimpleValueType();
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index c9cfd306593..efdd757bec4 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -23,7 +23,7 @@ enum IntrinsicType : uint16_t {
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX,
   FPCLASS, FPCLASSS, INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP,
   INTR_TYPE_2OP_IMM8, INTR_TYPE_3OP_IMM8,
-  CMP_MASK, CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
+  CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM,
   CVTPD2PS, CVTPD2PS_MASK, CVTPD2PS_RND_MASK,
   INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM,
   INTR_TYPE_2OP_MASK, INTR_TYPE_2OP_MASK_RM,
@@ -845,13 +845,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, CVTPS2PH_MASK,
                      X86ISD::CVTPS2PH, X86ISD::MCVTPS2PH),
 
-  X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_128, CMP_MASK,
-                     X86ISD::VPSHUFBITQMB, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_256, CMP_MASK,
-                     X86ISD::VPSHUFBITQMB, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_512, CMP_MASK,
-                     X86ISD::VPSHUFBITQMB, 0),
-
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
                      X86ISD::VFIXUPIMM, 0),
   X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1029,6 +1022,9 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_128 , IFMA_OP, X86ISD::VPMADD52L, 0),
   X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_256 , IFMA_OP, X86ISD::VPMADD52L, 0),
   X86_INTRINSIC_DATA(avx512_vpmadd52l_uq_512 , IFMA_OP, X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_vpshufbitqmb_128, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
+  X86_INTRINSIC_DATA(avx512_vpshufbitqmb_256, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
+  X86_INTRINSIC_DATA(avx512_vpshufbitqmb_512, INTR_TYPE_2OP, X86ISD::VPSHUFBITQMB, 0),
   X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
   X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
diff --git a/test/CodeGen/X86/vpshufbitqbm-intrinsics.ll b/test/CodeGen/X86/vpshufbitqbm-intrinsics.ll
index a80ed2f174e..f83f2df0c35 100644
--- a/test/CodeGen/X86/vpshufbitqbm-intrinsics.ll
+++ b/test/CodeGen/X86/vpshufbitqbm-intrinsics.ll
@@ -1,41 +1,51 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512bitalg,+avx512vl | FileCheck %s
 
-declare i16 @llvm.x86.avx512.mask.vpshufbitqmb.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
-define i16 @test_vpshufbitqmb_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
+define i16 @test_vpshufbitqmb_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
 ; CHECK-LABEL: test_vpshufbitqmb_128:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpshufbitqmb %xmm1, %xmm0, %k0 {%k1}
+; CHECK-NEXT:    vpshufbitqmb %xmm1, %xmm0, %k1
+; CHECK-NEXT:    vpshufbitqmb %xmm3, %xmm2, %k0 {%k1}
 ; CHECK-NEXT:    kmovd %k0, %eax
 ; CHECK-NEXT:    ## kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
-  %res = call i16 @llvm.x86.avx512.mask.vpshufbitqmb.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
-  ret i16 %res
+  %tmp = call <16 x i1> @llvm.x86.avx512.vpshufbitqmb.128(<16 x i8> %a, <16 x i8> %b)
+  %tmp1 = call <16 x i1> @llvm.x86.avx512.vpshufbitqmb.128(<16 x i8> %c, <16 x i8> %d)
+  %tmp2 = and <16 x i1> %tmp, %tmp1
+  %tmp3 = bitcast <16 x i1> %tmp2 to i16
+  ret i16 %tmp3
 }
 
-declare i32 @llvm.x86.avx512.mask.vpshufbitqmb.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
-define i32 @test_vpshufbitqmb_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) {
+define i32 @test_vpshufbitqmb_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
 ; CHECK-LABEL: test_vpshufbitqmb_256:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovd %edi, %k1
-; CHECK-NEXT:    vpshufbitqmb %ymm1, %ymm0, %k0 {%k1}
+; CHECK-NEXT:    vpshufbitqmb %ymm1, %ymm0, %k1
+; CHECK-NEXT:    vpshufbitqmb %ymm3, %ymm2, %k0 {%k1}
 ; CHECK-NEXT:    kmovd %k0, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  %res = call i32 @llvm.x86.avx512.mask.vpshufbitqmb.256(<32 x i8> %a, <32 x i8> %b, i32 %mask)
-  ret i32 %res
+  %tmp = call <32 x i1> @llvm.x86.avx512.vpshufbitqmb.256(<32 x i8> %a, <32 x i8> %b)
+  %tmp1 = call <32 x i1> @llvm.x86.avx512.vpshufbitqmb.256(<32 x i8> %c, <32 x i8> %d)
+  %tmp2 = and <32 x i1> %tmp, %tmp1
+  %tmp3 = bitcast <32 x i1> %tmp2 to i32
+  ret i32 %tmp3
 }
 
-declare i64 @llvm.x86.avx512.mask.vpshufbitqmb.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
-define i64 @test_vpshufbitqmb_512(<64 x i8> %a, <64 x i8> %b, i64 %mask) {
+define i64 @test_vpshufbitqmb_512(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
 ; CHECK-LABEL: test_vpshufbitqmb_512:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    kmovq %rdi, %k1
-; CHECK-NEXT:    vpshufbitqmb %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpshufbitqmb %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpshufbitqmb %zmm3, %zmm2, %k0 {%k1}
 ; CHECK-NEXT:    kmovq %k0, %rax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  %res = call i64 @llvm.x86.avx512.mask.vpshufbitqmb.512(<64 x i8> %a, <64 x i8> %b, i64 %mask)
-  ret i64 %res
+  %tmp = call <64 x i1> @llvm.x86.avx512.vpshufbitqmb.512(<64 x i8> %a, <64 x i8> %b)
+  %tmp1 = call <64 x i1> @llvm.x86.avx512.vpshufbitqmb.512(<64 x i8> %c, <64 x i8> %d)
+  %tmp2 = and <64 x i1> %tmp, %tmp1
+  %tmp3 = bitcast <64 x i1> %tmp2 to i64
+  ret i64 %tmp3
 }
+
+declare <16 x i1> @llvm.x86.avx512.vpshufbitqmb.128(<16 x i8>, <16 x i8>)
+declare <32 x i1> @llvm.x86.avx512.vpshufbitqmb.256(<32 x i8>, <32 x i8>)
+declare <64 x i1> @llvm.x86.avx512.vpshufbitqmb.512(<64 x i8>, <64 x i8>)
-- 
2.50.1
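
Note on the upgrade path (an annotation, not part of the patch): the
UpgradeIntrinsicCall hunk rewrites old bitcode that calls the masked intrinsic
into the new unmasked intrinsic followed by an AND on the vXi1 result. A
minimal LLVM IR sketch of that mapping; the value names %bits, %mvec, %and are
illustrative, and ApplyX86MaskOn1BitsVec skips the AND when the mask is known
all-ones:

  ; Pre-8.0 bitcode:
  ;   %res = call i16 @llvm.x86.avx512.mask.vpshufbitqmb.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
  ; After auto-upgrade:
  %bits = call <16 x i1> @llvm.x86.avx512.vpshufbitqmb.128(<16 x i8> %a, <16 x i8> %b)
  %mvec = bitcast i16 %mask to <16 x i1>  ; mask bit i controls lane i
  %and  = and <16 x i1> %bits, %mvec      ; the AND named in the commit message
  %res  = bitcast <16 x i1> %and to i16

This is the same pattern the updated test file exercises: the AND of two vXi1
results folds back into the instruction's k-mask operand, so the second
vpshufbitqmb executes under the mask produced by the first ({%k1}).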