From: Craig Topper Date: Fri, 24 Feb 2017 05:35:04 +0000 (+0000) Subject: [AVX-512] Remove lzcnt intrinsics and autoupgrade them to generic ctlz intrinsics... X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=474a418aa717d48d32974afc858aeb6ce7cfa9ba;p=llvm [AVX-512] Remove lzcnt intrinsics and autoupgrade them to generic ctlz intrinsics with select. Clang has been emitting ctlz intrinsics for a while now. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296091 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 03de14e95b9..019760cf97b 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -5422,32 +5422,6 @@ let TargetPrefix = "x86" in { Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>; - - def int_x86_avx512_mask_lzcnt_d_128 : - Intrinsic<[llvm_v4i32_ty], - [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty], - [IntrNoMem]>; - def int_x86_avx512_mask_lzcnt_d_256 : - Intrinsic<[llvm_v8i32_ty], - [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty], - [IntrNoMem]>; - def int_x86_avx512_mask_lzcnt_d_512 : - Intrinsic<[llvm_v16i32_ty], - [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], - [IntrNoMem]>; - - def int_x86_avx512_mask_lzcnt_q_128 : - Intrinsic<[llvm_v2i64_ty], - [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], - [IntrNoMem]>; - def int_x86_avx512_mask_lzcnt_q_256 : - Intrinsic<[llvm_v4i64_ty], - [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], - [IntrNoMem]>; - def int_x86_avx512_mask_lzcnt_q_512 : - Intrinsic<[llvm_v8i64_ty], - [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], - [IntrNoMem]>; } // Compares diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 97c91548cad..544ff271e0f 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -188,6 +188,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.startswith("avx2.pmovzx") || // Added in 3.9 
Name.startswith("avx512.mask.pmovsx") || // Added in 4.0 Name.startswith("avx512.mask.pmovzx") || // Added in 4.0 + Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0 Name == "sse2.cvtdq2pd" || // Added in 3.9 Name == "sse2.cvtps2pd" || // Added in 3.9 Name == "avx.cvtdq2.pd.256" || // Added in 3.9 @@ -1517,6 +1518,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); + } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) { + Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), + Intrinsic::ctlz, + CI->getType()), + { CI->getArgOperand(0), Builder.getInt1(false) }); + Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + CI->getArgOperand(1)); } else if (IsX86 && (Name.startswith("avx512.mask.max.p") || Name.startswith("avx512.mask.min.p"))) { bool IsMin = Name[13] == 'i'; diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 151bda487c1..a927b3b437c 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -795,18 +795,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VGETMANTS, 0), X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, X86ISD::VGETMANTS, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), - X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK, - ISD::CTLZ, 0), X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), 
X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index 8ad6906a5b6..4e95cffd4d0 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -641,15 +641,6 @@ define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) { } declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly -define <16 x i32> @test_conflict_d(<16 x i32> %a) { -; CHECK-LABEL: test_conflict_d: -; CHECK: ## BB#0: -; CHECK-NEXT: vpconflictd %zmm0, %zmm0 -; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res -} - define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) { ; CHECK-LABEL: test_cmpps: ; CHECK: ## BB#0: diff --git a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll new file mode 100644 index 00000000000..e5dbff9ac51 --- /dev/null +++ b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll @@ -0,0 +1,47 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s + +define <16 x i32> @test_lzcnt_d(<16 x i32> %a) { +; CHECK-LABEL: test_lzcnt_d: +; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntd %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1) + ret <16 x i32> %res +} + +declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly + +define <8 x i64> @test_lzcnt_q(<8 x i64> %a) { +; CHECK-LABEL: test_lzcnt_q: +; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntq %zmm0, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1) + ret <8 x i64> %res +} + +declare <8 x i64> 
@llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly + + +define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { +; CHECK-LABEL: test_mask_lzcnt_d: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) + ret <16 x i32> %res +} + +define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; CHECK-LABEL: test_mask_lzcnt_q: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) + ret <8 x i64> %res +} diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll index 5f3d654ff95..7e5a3e8fe25 100644 --- a/test/CodeGen/X86/avx512cd-intrinsics.ll +++ b/test/CodeGen/X86/avx512cd-intrinsics.ll @@ -62,22 +62,20 @@ define <16 x i32> @test_lzcnt_d(<16 x i32> %a) { ; CHECK: ## BB#0: ; CHECK-NEXT: vplzcntd %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false) + ret <16 x i32> %1 } - -declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly +declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) #0 define <8 x i64> @test_lzcnt_q(<8 x i64> %a) { ; CHECK-LABEL: test_lzcnt_q: ; CHECK: ## BB#0: ; CHECK-NEXT: vplzcntq %zmm0, %zmm0 ; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false) + ret <8 x i64> %1 } - -declare <8 x i64> 
@llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly +declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) #0 define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK-LABEL: test_mask_lzcnt_d: @@ -86,8 +84,10 @@ define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq - %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) - ret <16 x i32> %res + %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %b + ret <16 x i32> %3 } define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { @@ -97,6 +97,8 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq - %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) - ret <8 x i64> %res + %1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b + ret <8 x i64> %3 } diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll new file mode 100644 index 00000000000..8f528394f5b --- /dev/null +++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s + +declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128: +; CHECK: ## BB#0: 
+; CHECK-NEXT: vplzcntd %xmm0, %xmm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) + %res3 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2) + %res2 = add <4 x i32> %res, %res1 + %res4 = add <4 x i32> %res2, %res3 + ret <4 x i32> %res4 +} + +declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntd %ymm0, %ymm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128: +; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntq %xmm0, %xmm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1) + %res2 = add <2 x i64> %res, %res1 + 
ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256: +; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntq %ymm0, %ymm2 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1} +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll index b27b795b440..01c1df7d93c 100644 --- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll +++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll @@ -1,75 +1,83 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s -declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readonly - -declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8) - -define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { +define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128: ; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntd %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vplzcntd %xmm0, %xmm2 {%k1} {z} -; CHECK-NEXT: vplzcntd %xmm0, %xmm0 +; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z} +; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <4 x i32> 
@llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) - %res3 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res2, %res3 + %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) + %2 = bitcast i8 %x2 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1 + %4 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) + %5 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false) + %6 = bitcast i8 %x2 to <8 x i1> + %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer + %res2 = add <4 x i32> %3, %4 + %res4 = add <4 x i32> %res2, %7 ret <4 x i32> %res4 } +declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #0 -declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { +define <8 x i32> @test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256: ; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntd %ymm0, %ymm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vplzcntd %ymm0, %ymm0 -; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq - %res = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 + %1 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %x0, i1 false) + %2 = bitcast i8 %x2 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 + %4 = 
call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %x0, i1 false) + %res2 = add <8 x i32> %3, %4 ret <8 x i32> %res2 } +declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) #0 -declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8) - -define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { +define <2 x i64> @test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128: ; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntq %xmm0, %xmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntq %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vplzcntq %xmm0, %xmm0 -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: retq - %res = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 + %1 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x0, i1 false) + %2 = bitcast i8 %x2 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 + %4 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x0, i1 false) + %res2 = add <2 x i64> %3, %4 ret <2 x i64> %res2 } +declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) #0 -declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { +define <4 x i64> @test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) { ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256: ; CHECK: ## BB#0: +; CHECK-NEXT: vplzcntq %ymm0, %ymm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntq %ymm0, %ymm1 {%k1} -; CHECK-NEXT: vplzcntq %ymm0, %ymm0 -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm0 ; CHECK-NEXT: retq - %res = call <4 x 
i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 + %1 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %x0, i1 false) + %2 = bitcast i8 %x2 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 + %4 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %x0, i1 false) + %res2 = add <4 x i64> %3, %4 ret <4 x i64> %res2 } +declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) #0 declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8)