From: Craig Topper
Date: Sat, 18 Feb 2017 20:14:20 +0000 (+0000)
Subject: Revert "[X86] Remove XOP VPCMOV intrinsics and autoupgrade them to native IR."
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=b3d03a9308e76b5a396f05b9abbabec37beb3499;p=llvm

Revert "[X86] Remove XOP VPCMOV intrinsics and autoupgrade them to native IR."

This reverts r295564. I missed that clang was still using the intrinsics
despite our half-implemented autoupgrade support.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295565 91177308-0d34-0410-b5e6-96231b3b80d8
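For reference, the upgrade path being narrowed in lib/IR/AutoUpgrade.cpp below
rewrites a 128-bit llvm.x86.xop.vpcmov call into plain bitwise IR, the same
not/and/and/or sequence that the old test bodies in
test/CodeGen/X86/xop-intrinsics-x86_64.ll spelled out. A minimal sketch of that
expansion (the function name @vpcmov128_expanded is illustrative only and is
not part of this patch):

  ; Bits set in %sel take their value from %a; clear bits take it from %b.
  define <2 x i64> @vpcmov128_expanded(<2 x i64> %a, <2 x i64> %b, <2 x i64> %sel) {
    %notsel = xor <2 x i64> %sel, <i64 -1, i64 -1>   ; Builder.CreateNot(Sel)
    %sel0 = and <2 x i64> %a, %sel                   ; Builder.CreateAnd(getArgOperand(0), Sel)
    %sel1 = and <2 x i64> %b, %notsel                ; other operand masked with the inverted selector
    %res = or <2 x i64> %sel0, %sel1                 ; merge the two halves
    ret <2 x i64> %res
  }

The XOP vpcmov instruction implements exactly this bitwise select, which is why
the Pat<> patterns restored in X86InstrXOP.td below still match the
(or (and ...), (X86andnp ...)) form back to VPCMOVrrr/VPCMOVrrrY.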
---
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 03de14e95b9..97e0ff6d39c 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3033,6 +3033,17 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_xop_vfrcz_ps_256 : GCCBuiltin<"__builtin_ia32_vfrczps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
+  def int_x86_xop_vpcmov :
+        GCCBuiltin<"__builtin_ia32_vpcmov">,
+        Intrinsic<[llvm_v2i64_ty],
+          [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
+          [IntrNoMem]>;
+  def int_x86_xop_vpcmov_256 :
+        GCCBuiltin<"__builtin_ia32_vpcmov_256">,
+        Intrinsic<[llvm_v4i64_ty],
+          [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty],
+          [IntrNoMem]>;
+
   def int_x86_xop_vpcomb : GCCBuiltin<"__builtin_ia32_vpcomb">,
       Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
                 [IntrNoMem]>;
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index 374ebffabf1..977a61f0542 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -230,7 +230,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx2.pblendd.") || // Added in 3.7
       Name.startswith("avx.vbroadcastf128") || // Added in 4.0
       Name == "avx2.vbroadcasti128" || // Added in 3.7
-      Name.startswith("xop.vpcmov") || // Added in 3.8
+      Name == "xop.vpcmov" || // Added in 3.8
       Name.startswith("avx512.mask.move.s") || // Added in 4.0
       (Name.startswith("xop.vpcom") && // Added in 3.2
        F->arg_size() == 2))
@@ -1078,7 +1078,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     Rep = Builder.CreateCall(VPCOM, {CI->getArgOperand(0),
                                      CI->getArgOperand(1), Builder.getInt8(Imm)});
-  } else if (IsX86 && Name.startswith("xop.vpcmov")) {
+  } else if (IsX86 && Name == "xop.vpcmov") {
     Value *Sel = CI->getArgOperand(2);
     Value *NotSel = Builder.CreateNot(Sel);
     Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel);
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 01cacb44b82..b259a539943 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1777,7 +1777,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // XOP foldable instructions
     { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
-    { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 },
+    { X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 },
     { X86::VPCOMBri, X86::VPCOMBmi, 0 },
     { X86::VPCOMDri, X86::VPCOMDmi, 0 },
     { X86::VPCOMQri, X86::VPCOMQmi, 0 },
@@ -2478,7 +2478,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // XOP foldable instructions
     { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
-    { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
+    { X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 },
     { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
     { X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
     { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index bd8a700595c..10acdca2df7 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -290,41 +290,84 @@ let ExeDomain = SSEPackedInt in {
 }

 // Instruction where either second or third source can be memory
-multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
-                      X86MemOperand x86memop, ValueType VT> {
+multiclass xop4op_int<bits<8> opc, string OpcodeStr,
+                      Intrinsic Int128, Intrinsic Int256> {
   // 128-bit Instruction
-  def rrr : IXOPi8Reg, XOP_4V;
-  def rrm : IXOPi8Reg,
+            XOP_4V;
   def rrm : IXOPi8Reg,
+            [(set VR128:$dst,
+              (Int128 VR128:$src1, VR128:$src2,
+                (bitconvert (loadv2i64 addr:$src3))))]>,
             XOP_4V, VEX_W;
-  def rmr : IXOPi8Reg,
+            [(set VR128:$dst,
+              (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
+               VR128:$src3))]>,
             XOP_4V;
   // For disassembler
   let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
-    def rrr_REV : IXOPi8Reg, XOP_4V, VEX_W;
+
+  // 256-bit Instruction
+  def rrrY : IXOPi8Reg,
+             XOP_4V, VEX_L;
+  def rrmY : IXOPi8Reg,
+             XOP_4V, VEX_W, VEX_L;
+  def rmrY : IXOPi8Reg,
+             XOP_4V, VEX_L;
+  // For disassembler
+  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
+    def rrrY_REV : IXOPi8Reg, XOP_4V, VEX_W, VEX_L;
 }

 let ExeDomain = SSEPackedInt in {
-  defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>;
-  defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L;
+  defm VPCMOV : xop4op_int<0xA2, "vpcmov",
+                           int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>;
+}
+
+let Predicates = [HasXOP] in {
+  def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
+                       (X86andnp VR128:$src3, VR128:$src2))),
+            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+  def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
+                       (X86andnp VR256:$src3, VR256:$src2))),
+            (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
 }

 multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
index a100a1425dd..a9287e7d8c9 100644
--- a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -499,22 +499,12 @@ declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
 define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
 ; X32-LABEL: test_mm256_cmov_si256:
 ; X32:       # BB#0:
-; X32-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; X32-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
-; X32-NEXT:    vxorps %ymm3, %ymm2, %ymm3
-; X32-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; X32-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; X32-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_cmov_si256:
 ; X64:       # BB#0:
-; X64-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
-; X64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
-; X64-NEXT:    vxorps %ymm3, %ymm2, %ymm3
-; X64-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; X64-NEXT:    vandps %ymm3, %ymm1, %ymm1
-; X64-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
   ret <4 x i64> %res
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
index 2369beffb6b..6fba72f2681 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
@@ -725,42 +725,3 @@ define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readnone
-
-define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
-; CHECK-LABEL: test_int_x86_xop_vpcmov:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
-
-define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
-; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
-  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
-; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
-; CHECK-NEXT:    retq
-  %vec = load <4 x i64>, <4 x i64>* %a1
-  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %vec, <4 x i64> %a2) ;
-  ret <4 x i64> %res
-}
-define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
-; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    retq
-  %vec = load <4 x i64>, <4 x i64>* %a2
-  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %vec) ;
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
-
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index 76286a26ffa..bb6ef50cdc6 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -82,23 +82,18 @@ define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %1 = xor <2 x i64> %a2, <i64 -1, i64 -1>
-  %2 = and <2 x i64> %a0, %a2
-  %3 = and <2 x i64> %a1, %1
-  %4 = or <2 x i64> %2, %3
-  ret <2 x i64> %4
+  %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
+  ret <2 x i64> %res
 }
+declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone

 define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %1 = xor <4 x i64> %a2, <i64 -1, i64 -1, i64 -1, i64 -1>
-  %2 = and <4 x i64> %a0, %a2
-  %3 = and <4 x i64> %a1, %1
-  %4 = or <4 x i64> %2, %3
-  ret <4 x i64> %4
+  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
+  ret <4 x i64> %res
 }
 define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
@@ -106,24 +101,19 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1,
 ; CHECK-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %a1
-  %1 = xor <4 x i64> %a2, <i64 -1, i64 -1, i64 -1, i64 -1>
-  %2 = and <4 x i64> %a0, %a2
-  %3 = and <4 x i64> %vec, %1
-  %4 = or <4 x i64> %2, %3
-  ret <4 x i64> %4
+  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %vec, <4 x i64> %a2) ;
+  ret <4 x i64> %res
 }
 define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %vec = load <4 x i64>, <4 x i64>* %a2
-  %1 = xor <4 x i64> %vec, <i64 -1, i64 -1, i64 -1, i64 -1>
-  %2 = and <4 x i64> %a0, %vec
-  %3 = and <4 x i64> %a1, %1
-  %4 = or <4 x i64> %2, %3
-  ret <4 x i64> %4
+  %vec = load <4 x i64>, <4 x i64>* %a2
+  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %vec) ;
+  ret <4 x i64> %res
 }
+declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone

 define <4 x i32> @test_int_x86_xop_vphaddbd(<16 x i8> %a0) {
 ; CHECK-LABEL: test_int_x86_xop_vphaddbd: