From: Sanjay Patel Date: Thu, 16 Jun 2016 15:48:30 +0000 (+0000) Subject: [x86] autoupgrade and remove SSE2/SSE41 integer min/max intrinsics X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=2c8c45d6950ac31f4e927c240ad663100b622999;p=llvm [x86] autoupgrade and remove SSE2/SSE41 integer min/max intrinsics Follow-up to: http://reviews.llvm.org/rL272806 http://reviews.llvm.org/rL272807 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@272907 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 75b0d64643a..ca169b1c208 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -406,18 +406,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_pmaxu_b : - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_pmaxs_w : - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_pminu_b : - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, - llvm_v16i8_ty], [IntrNoMem, Commutative]>; - def int_x86_sse2_pmins_w : - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, - llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; @@ -735,34 +723,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". [IntrNoMem]>; } -// Vector compare, min, max -let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". - def int_x86_sse41_pmaxsb : - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], - [IntrNoMem, Commutative]>; - def int_x86_sse41_pmaxsd : - Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem, Commutative]>; - def int_x86_sse41_pmaxud : - Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem, Commutative]>; - def int_x86_sse41_pmaxuw : - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], - [IntrNoMem, Commutative]>; - def int_x86_sse41_pminsb : - Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], - [IntrNoMem, Commutative]>; - def int_x86_sse41_pminsd : - Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem, Commutative]>; - def int_x86_sse41_pminud : - Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], - [IntrNoMem, Commutative]>; - def int_x86_sse41_pminuw : - Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], - [IntrNoMem, Commutative]>; -} - // Advanced Encryption Standard (AES) Instructions let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_aesni_aesimc : GCCBuiltin<"__builtin_ia32_aesimc128">, diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp index 6e4cced62e1..f13009a4b4e 100644 --- a/lib/IR/AutoUpgrade.cpp +++ b/lib/IR/AutoUpgrade.cpp @@ -174,6 +174,18 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { Name.startswith("x86.sse2.pcmpgt.") || Name.startswith("x86.avx2.pcmpeq.") || Name.startswith("x86.avx2.pcmpgt.") || + Name == "x86.sse41.pmaxsb" || + Name == "x86.sse2.pmaxs.w" || + Name == "x86.sse41.pmaxsd" || + Name == "x86.sse2.pmaxu.b" || + Name == "x86.sse41.pmaxuw" || + Name == "x86.sse41.pmaxud" || + Name == "x86.sse41.pminsb" || + Name == "x86.sse2.pmins.w" || + Name == "x86.sse41.pminsd" || + Name == "x86.sse2.pminu.b" || + Name == "x86.sse41.pminuw" || + Name == "x86.sse41.pminud" || Name.startswith("x86.avx2.vbroadcast") || Name.startswith("x86.avx2.pbroadcast") || Name.startswith("x86.avx.vpermil.") || @@ -518,6 +530,14 @@ static Value *UpgradeMaskedLoad(IRBuilder<> &Builder, LLVMContext &C, return Builder.CreateMaskedLoad(Ptr, Align, Mask, Passthru); } +static Value *upgradeIntMinMax(IRBuilder<> &Builder, CallInst &CI, + ICmpInst::Predicate Pred) { + Value *Op0 = CI.getArgOperand(0); + Value *Op1 = CI.getArgOperand(1); + Value *Cmp = Builder.CreateICmp(Pred, Op0, Op1); + return Builder.CreateSelect(Cmp, Op0, Op1); +} + /// Upgrade a call to an old intrinsic. All argument and return casting must be /// provided to seamlessly integrate with existing context. void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { @@ -544,6 +564,22 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) { Rep = Builder.CreateICmpSGT(CI->getArgOperand(0), CI->getArgOperand(1), "pcmpgt"); Rep = Builder.CreateSExt(Rep, CI->getType(), ""); + } else if (Name == "llvm.x86.sse41.pmaxsb" || + Name == "llvm.x86.sse2.pmaxs.w" || + Name == "llvm.x86.sse41.pmaxsd") { + Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SGT); + } else if (Name == "llvm.x86.sse2.pmaxu.b" || + Name == "llvm.x86.sse41.pmaxuw" || + Name == "llvm.x86.sse41.pmaxud") { + Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_UGT); + } else if (Name == "llvm.x86.sse41.pminsb" || + Name == "llvm.x86.sse2.pmins.w" || + Name == "llvm.x86.sse41.pminsd") { + Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_SLT); + } else if (Name == "llvm.x86.sse2.pminu.b" || + Name == "llvm.x86.sse41.pminuw" || + Name == "llvm.x86.sse41.pminud") { + Rep = upgradeIntMinMax(Builder, *CI, ICmpInst::ICMP_ULT); } else if (Name == "llvm.x86.sse2.cvtdq2pd" || Name == "llvm.x86.sse2.cvtps2pd" || Name == "llvm.x86.avx.cvtdq2.pd.256" || diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index d07a42b7011..8fab49cf449 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -2104,10 +2104,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(sse2_pmaxs_w, INTR_TYPE_2OP, ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse2_pmaxu_b, INTR_TYPE_2OP, ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse2_pmins_w, INTR_TYPE_2OP, ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse2_pminu_b, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), X86_INTRINSIC_DATA(sse2_pmulhu_w, INTR_TYPE_2OP, ISD::MULHU, 0), @@ -2146,14 +2142,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), - X86_INTRINSIC_DATA(sse41_pmaxsb, INTR_TYPE_2OP, ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxsd, INTR_TYPE_2OP, ISD::SMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxud, INTR_TYPE_2OP, ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse41_pmaxuw, INTR_TYPE_2OP, ISD::UMAX, 0), - X86_INTRINSIC_DATA(sse41_pminsb, INTR_TYPE_2OP, ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse41_pminsd, INTR_TYPE_2OP, ISD::SMIN, 0), - X86_INTRINSIC_DATA(sse41_pminud, INTR_TYPE_2OP, ISD::UMIN, 0), - X86_INTRINSIC_DATA(sse41_pminuw, INTR_TYPE_2OP, ISD::UMIN, 0), X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll index f3573e93ac2..a7bb4e5bfe1 100644 --- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -144,8 +144,8 @@ define <4 x i32> @test_x86_sse2_pshuf_d(<4 x i32> %a) { ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,1,0] ; CHECK-NEXT: retl entry: - %res = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone - ret <4 x i32> %res + %res = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) nounwind readnone + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) nounwind readnone @@ -155,8 +155,8 @@ define <8 x i16> @test_x86_sse2_pshufl_w(<8 x i16> %a) { ; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7] ; CHECK-NEXT: retl entry: - %res = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone - ret <8 x i16> %res + %res = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27) nounwind readnone + ret <8 x i16> %res } declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) nounwind readnone @@ -166,7 +166,52 @@ define <8 x i16> @test_x86_sse2_pshufh_w(<8 x i16> %a) { ; CHECK-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; CHECK-NEXT: retl entry: - %res = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone - ret <8 x i16> %res + %res = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27) nounwind readnone + ret <8 x i16> %res } declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) nounwind readnone + +define <16 x i8> @max_epu8(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: max_epu8: +; CHECK: ## BB#0: +; CHECK-NEXT: pmaxub %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @min_epu8(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: min_epu8: +; CHECK: ## BB#0: +; CHECK-NEXT: pminub %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @max_epi16(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: max_epi16: +; CHECK: ## BB#0: +; CHECK-NEXT: pmaxsw %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: min_epi16: +; CHECK: ## BB#0: +; CHECK-NEXT: pminsw %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone + diff --git a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll index 72bf4395bb9..4f6aa798faf 100644 --- a/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll +++ b/test/CodeGen/X86/sse41-intrinsics-x86-upgrade.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse4.1 | FileCheck %s ; This test works just like the non-upgrade one except that it only checks @@ -211,3 +211,92 @@ define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) { ret <2 x i64> %res } declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone + +define <16 x i8> @max_epi8(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: max_epi8: +; CHECK: ## BB#0: +; CHECK-NEXT: pmaxsb %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone + +define <16 x i8> @min_epi8(<16 x i8> %a0, <16 x i8> %a1) { +; CHECK-LABEL: min_epi8: +; CHECK: ## BB#0: +; CHECK-NEXT: pminsb %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone + +define <8 x i16> @max_epu16(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: max_epu16: +; CHECK: ## BB#0: +; CHECK-NEXT: pmaxuw %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone + +define <8 x i16> @min_epu16(<8 x i16> %a0, <8 x i16> %a1) { +; CHECK-LABEL: min_epu16: +; CHECK: ## BB#0: +; CHECK-NEXT: pminuw %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone + +define <4 x i32> @max_epi32(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: max_epi32: +; CHECK: ## BB#0: +; CHECK-NEXT: pmaxsd %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i32> @min_epi32(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: min_epi32: +; CHECK: ## BB#0: +; CHECK-NEXT: pminsd %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i32> @max_epu32(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: max_epu32: +; CHECK: ## BB#0: +; CHECK-NEXT: pmaxud %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i32> @min_epu32(<4 x i32> %a0, <4 x i32> %a1) { +; CHECK-LABEL: min_epu32: +; CHECK: ## BB#0: +; CHECK-NEXT: pminud %xmm1, %xmm0 +; CHECK-NEXT: retl +; + %res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone +