From aa2f6f93a28d2977b3dbc9856fb47c22ca7fbb94 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 18 Feb 2017 19:51:25 +0000
Subject: [PATCH] [X86] Remove XOP VPCMOV intrinsics and autoupgrade them to
 native IR.

It seems we were already upgrading 128-bit VPCMOV, but the intrinsic was
still defined and being used in isel patterns. While I was here I also
simplified the tablegen multiclasses.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295564 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsX86.td              | 11 ---
 lib/IR/AutoUpgrade.cpp                        |  4 +-
 lib/Target/X86/X86InstrInfo.cpp               |  4 +-
 lib/Target/X86/X86InstrXOP.td                 | 79 +++++--------
 test/CodeGen/X86/xop-intrinsics-fast-isel.ll  | 14 +++-
 .../X86/xop-intrinsics-x86_64-upgrade.ll      | 39 +++++++++
 test/CodeGen/X86/xop-intrinsics-x86_64.ll     | 32 +++++---
 7 files changed, 94 insertions(+), 89 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 97e0ff6d39c..03de14e95b9 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3033,17 +3033,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_xop_vfrcz_ps_256 : GCCBuiltin<"__builtin_ia32_vfrczps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
 
-  def int_x86_xop_vpcmov :
-        GCCBuiltin<"__builtin_ia32_vpcmov">,
-        Intrinsic<[llvm_v2i64_ty],
-                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
-                  [IntrNoMem]>;
-  def int_x86_xop_vpcmov_256 :
-        GCCBuiltin<"__builtin_ia32_vpcmov_256">,
-        Intrinsic<[llvm_v4i64_ty],
-                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty],
-                  [IntrNoMem]>;
-
   def int_x86_xop_vpcomb : GCCBuiltin<"__builtin_ia32_vpcomb">,
       Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
                 [IntrNoMem]>;
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 977a61f0542..374ebffabf1 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -230,7 +230,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx2.pblendd.") || // Added in 3.7
       Name.startswith("avx.vbroadcastf128") || // Added in 4.0
       Name == "avx2.vbroadcasti128" || // Added in 3.7
-      Name == "xop.vpcmov" || // Added in 3.8
+      Name.startswith("xop.vpcmov") || // Added in 3.8
       Name.startswith("avx512.mask.move.s") || // Added in 4.0
       (Name.startswith("xop.vpcom") && // Added in 3.2
        F->arg_size() == 2))
@@ -1078,7 +1078,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     Rep = Builder.CreateCall(VPCOM, {CI->getArgOperand(0),
                                      CI->getArgOperand(1),
                                      Builder.getInt8(Imm)});
-  } else if (IsX86 && Name == "xop.vpcmov") {
+  } else if (IsX86 && Name.startswith("xop.vpcmov")) {
     Value *Sel = CI->getArgOperand(2);
     Value *NotSel = Builder.CreateNot(Sel);
     Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel);
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index b259a539943..01cacb44b82 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1777,7 +1777,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
 
     // XOP foldable instructions
     { X86::VPCMOVrrr,         X86::VPCMOVrmr,           0 },
-    { X86::VPCMOVrrrY,        X86::VPCMOVrmrY,          0 },
+    { X86::VPCMOVYrrr,        X86::VPCMOVYrmr,          0 },
     { X86::VPCOMBri,          X86::VPCOMBmi,            0 },
     { X86::VPCOMDri,          X86::VPCOMDmi,            0 },
     { X86::VPCOMQri,          X86::VPCOMQmi,            0 },
@@ -2478,7 +2478,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
 
    // XOP foldable instructions
     { X86::VPCMOVrrr,         X86::VPCMOVrrm,           0 },
-    { X86::VPCMOVrrrY,        X86::VPCMOVrrmY,          0 },
+    { X86::VPCMOVYrrr,        X86::VPCMOVYrrm,          0 },
     { X86::VPERMIL2PDrr,      X86::VPERMIL2PDrm,        0 },
     { X86::VPERMIL2PDrrY,     X86::VPERMIL2PDrmY,       0 },
     { X86::VPERMIL2PSrr,      X86::VPERMIL2PSrm,        0 },
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 10acdca2df7..bd8a700595c 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -290,84 +290,41 @@ let ExeDomain = SSEPackedInt in {
 }
 
 // Instruction where either second or third source can be memory
-multiclass xop4op_int<bits<8> opc, string OpcodeStr,
-                      Intrinsic Int128, Intrinsic Int256> {
+multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                      X86MemOperand x86memop, ValueType VT> {
   // 128-bit Instruction
-  def rrr : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
-                      (ins VR128:$src1, VR128:$src2, VR128:$src3),
-                      !strconcat(OpcodeStr,
-                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                      [(set VR128:$dst,
-                        (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>,
-                      XOP_4V;
-  def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
-                      (ins VR128:$src1, VR128:$src2, i128mem:$src3),
-                      !strconcat(OpcodeStr,
-                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                      [(set VR128:$dst,
-                        (Int128 VR128:$src1, VR128:$src2,
-                          (bitconvert (loadv2i64 addr:$src3))))]>,
-                      XOP_4V, VEX_W;
-  def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
-                      (ins VR128:$src1, i128mem:$src2, VR128:$src3),
-                      !strconcat(OpcodeStr,
-                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                      [(set VR128:$dst,
-                        (Int128 VR128:$src1,
-                          (bitconvert (loadv2i64 addr:$src2)), VR128:$src3))]>,
-                      XOP_4V;
+  def rrr : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
+                      (ins RC:$src1, RC:$src2, RC:$src3),
+                      !strconcat(OpcodeStr,
+                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                      [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+                                             (X86andnp RC:$src3, RC:$src2))))]>,
+                      XOP_4V;
+  def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
+                      (ins RC:$src1, RC:$src2, x86memop:$src3),
+                      !strconcat(OpcodeStr,
+                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                      [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
+                                             (X86andnp (load addr:$src3), RC:$src2))))]>,
+                      XOP_4V, VEX_W;
+  def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
+                      (ins RC:$src1, x86memop:$src2, RC:$src3),
+                      !strconcat(OpcodeStr,
+                      "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                      [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+                                             (X86andnp RC:$src3, (load addr:$src2)))))]>,
+                      XOP_4V;
   // For disassembler
   let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
-  def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
-                          (ins VR128:$src1, VR128:$src2, VR128:$src3),
-                          !strconcat(OpcodeStr,
-                          "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                          []>, XOP_4V, VEX_W;
-
-  // 256-bit Instruction
-  def rrrY : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst),
-                       (ins VR256:$src1, VR256:$src2, VR256:$src3),
-                       !strconcat(OpcodeStr,
-                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                       [(set VR256:$dst,
-                         (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>,
-                       XOP_4V, VEX_L;
-  def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst),
-                       (ins VR256:$src1, VR256:$src2, i256mem:$src3),
-                       !strconcat(OpcodeStr,
-                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                       [(set VR256:$dst,
-                         (Int256 VR256:$src1, VR256:$src2,
-                          (bitconvert (loadv4i64 addr:$src3))))]>,
-                       XOP_4V, VEX_W, VEX_L;
-  def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst),
-                       (ins VR256:$src1, i256mem:$src2, VR256:$src3),
-                       !strconcat(OpcodeStr,
-                       "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                       [(set VR256:$dst,
-                         (Int256 VR256:$src1,
-                          (bitconvert (loadv4i64 addr:$src2)), VR256:$src3))]>,
-                       XOP_4V, VEX_L;
-  // For disassembler
-  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
-  def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst),
-                           (ins VR256:$src1, VR256:$src2, VR256:$src3),
-                           !strconcat(OpcodeStr,
-                           "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-                           []>, XOP_4V, VEX_W, VEX_L;
+  def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
+                          (ins RC:$src1, RC:$src2, RC:$src3),
+                          !strconcat(OpcodeStr,
+                          "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                          []>, XOP_4V, VEX_W;
 }
 
 let ExeDomain = SSEPackedInt in {
-  defm VPCMOV : xop4op_int<0xA2, "vpcmov",
-                           int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>;
-}
-
-let Predicates = [HasXOP] in {
-  def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
-                       (X86andnp VR128:$src3, VR128:$src2))),
-            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
-
-  def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
-                       (X86andnp VR256:$src3, VR256:$src2))),
-            (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+  defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>;
+  defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L;
 }
 
 multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
index a9287e7d8c9..a100a1425dd 100644
--- a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -499,12 +499,22 @@ declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
 define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
 ; X32-LABEL: test_mm256_cmov_si256:
 ; X32:       # BB#0:
-; X32-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; X32-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; X32-NEXT:    vxorps %ymm3, %ymm2, %ymm3
+; X32-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X32-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; X32-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_cmov_si256:
 ; X64:       # BB#0:
-; X64-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; X64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; X64-NEXT:    vxorps %ymm3, %ymm2, %ymm3
+; X64-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; X64-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
   ret <4 x i64> %res
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
index 6fba72f2681..2369beffb6b 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
@@ -725,3 +725,42 @@ define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpcmov:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %vec = load <4 x i64>, <4 x i64>* %a1
+  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %vec, <4 x i64> %a2) ;
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %vec = load <4 x i64>, <4 x i64>* %a2
+  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %vec) ;
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
+
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index bb6ef50cdc6..76286a26ffa 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -82,18 +82,23 @@ define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
-  ret <2 x i64> %res
+  %1 = xor <2 x i64> %a2, <i64 -1, i64 -1>
+  %2 = and <2 x i64> %a0, %a2
+  %3 = and <2 x i64> %a1, %1
+  %4 = or <2 x i64> %2, %3
+  ret <2 x i64> %4
 }
-declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
 
 define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
-  ret <4 x i64> %res
+  %1 = xor <4 x i64> %a2, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %2 = and <4 x i64> %a0, %a2
+  %3 = and <4 x i64> %a1, %1
+  %4 = or <4 x i64> %2, %3
+  ret <4 x i64> %4
 }
 define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
@@ -101,19 +106,24 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1,
 ; CHECK-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %a1
-  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %vec, <4 x i64> %a2) ;
-  ret <4 x i64> %res
+  %1 = xor <4 x i64> %a2, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %2 = and <4 x i64> %a0, %a2
+  %3 = and <4 x i64> %vec, %1
+  %4 = or <4 x i64> %2, %3
+  ret <4 x i64> %4
 }
 define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %vec = load <4 x i64>, <4 x i64>* %a2
-  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %vec) ;
-  ret <4 x i64> %res
+  %vec = load <4 x i64>, <4 x i64>* %a2
+  %1 = xor <4 x i64> %vec, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %2 = and <4 x i64> %a0, %vec
+  %3 = and <4 x i64> %a1, %1
+  %4 = or <4 x i64> %2, %3
+  ret <4 x i64> %4
 }
-declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vphaddbd(<16 x i8> %a0) {
 ; CHECK-LABEL: test_int_x86_xop_vphaddbd:
-- 
2.40.0
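
For reference, a minimal LLVM IR sketch of the upgrade this patch enables, in
the spirit of the tests above (value names are illustrative; the third operand
of vpcmov is the bitwise selector):

  ; before upgrade: a call to the now-removed intrinsic
  %r = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a, <2 x i64> %b, <2 x i64> %sel)

  ; after upgrade: (%a & %sel) | (%b & ~%sel), which the new
  ; or/and/X86andnp isel patterns fold back into a single vpcmov
  %notsel = xor <2 x i64> %sel, <i64 -1, i64 -1>
  %t0 = and <2 x i64> %a, %sel
  %t1 = and <2 x i64> %b, %notsel
  %r2 = or <2 x i64> %t0, %t1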