From: Kevin Qin <Kevin.Qin@arm.com>
Date: Tue, 10 Dec 2013 06:49:01 +0000 (+0000)
Subject: [AArch64 NEON] Support poly128_t and implement relevant intrinsic.
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3071f44f362397a188b0899f7f6f5e7c53377d19;p=clang

[AArch64 NEON] Support poly128_t and implement relevant intrinsic.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@196888 91177308-0d34-0410-b5e6-96231b3b80d8
---

diff --git a/include/clang/Basic/TargetBuiltins.h b/include/clang/Basic/TargetBuiltins.h
index ed3cc49fed..e2b5b2423f 100644
--- a/include/clang/Basic/TargetBuiltins.h
+++ b/include/clang/Basic/TargetBuiltins.h
@@ -91,6 +91,7 @@ namespace clang {
       Poly8,
       Poly16,
       Poly64,
+      Poly128,
       Float16,
       Float32,
       Float64
diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td
index b918459f4e..a1065d9f72 100644
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -30,6 +30,7 @@ def OP_MUL   : Op;
 def OP_MLA   : Op;
 def OP_MLAL  : Op;
 def OP_MULLHi : Op;
+def OP_MULLHi_P64 : Op;
 def OP_MULLHi_N : Op;
 def OP_MLALHi : Op;
 def OP_MLALHi_N : Op;
@@ -224,6 +225,7 @@ class NoTestOpInst<string n, string p, string t, Op o> : Inst<n, p, t, o> {}
 // s: short
 // i: int
 // l: long
+// k: 128-bit long
 // f: float
 // h: half-float
 // d: double
@@ -603,6 +605,9 @@ def LD3_DUP  : WInst<"vld3_dup", "3c",
 def LD4_DUP  : WInst<"vld4_dup", "4c",
                     "QUcQUsQUiQUlQcQsQiQlQhQfQdQPcQPsQPlUcUsUiUlcsilhfdPcPsPl">;
 
+def VLDRQ : WInst<"vldrq", "sc", "Pk">;
+def VSTRQ : WInst<"vstrq", "vps", "Pk">;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Addition
 // With additional d, Qd type.
@@ -854,6 +859,9 @@ def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "wwkk", "si", OP_QDMLALHi>;
 def VQDMLAL_HIGH_N : SOpInst<"vqdmlal_high_n", "wwks", "si", OP_QDMLALHi_N>;
 def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>;
 def VQDMLSL_HIGH_N : SOpInst<"vqdmlsl_high_n", "wwks", "si", OP_QDMLSLHi_N>;
+def VMULL_P64    : SInst<"vmull", "rss", "Pl">;
+def VMULL_HIGH_P64 : SOpInst<"vmull_high", "rdd", "HPl", OP_MULLHi_P64>;
+
 
 ////////////////////////////////////////////////////////////////////////////////
 // Extract or insert element from vector
@@ -1024,7 +1032,7 @@ def VQTBX4_A64 : WInst<"vqtbx4", "ddDt", "UccPcQUcQcQPc">;
 // With additional d, Qd, pl, Qpl types
 def REINTERPRET
   : NoTestOpInst<"vreinterpret", "dd",
-         "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPl", OP_REINT>;
+         "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", OP_REINT>;
 
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 108c0f0887..8bf50a0adb 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -1624,6 +1624,11 @@ static llvm::VectorType *GetNeonType(CodeGenFunction *CGF,
   case NeonTypeFlags::Int64:
   case NeonTypeFlags::Poly64:
     return llvm::VectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad));
+  case NeonTypeFlags::Poly128:
+    // FIXME: i128 and f128 doesn't get fully support in Clang and llvm.
+    // There is a lot of i128 and f128 API missing.
+    // so we use v16i8 to represent poly128 and get pattern matched.
+    return llvm::VectorType::get(CGF->Int8Ty, 16);
   case NeonTypeFlags::Float32:
     return llvm::VectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad));
   case NeonTypeFlags::Float64:
@@ -2555,6 +2560,9 @@ static Value *EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF,
   case AArch64::BI__builtin_neon_vcvtd_n_u64_f64:
     Int = Intrinsic::aarch64_neon_vcvtd_n_u64_f64;
     s = "fcvtzu"; OverloadInt = false; break;
+  case AArch64::BI__builtin_neon_vmull_p64:
+    Int = Intrinsic::aarch64_neon_vmull_p64;
+    s = "vmull"; OverloadInt = false; break;
   }
 
   if (!Int)
@@ -2908,6 +2916,28 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
 
   SmallVector<Value *, 4> Ops;
   llvm::Value *Align = 0; // Alignment for load/store
+
+  if (BuiltinID == AArch64::BI__builtin_neon_vldrq_p128) {
+   Value *Op = EmitScalarExpr(E->getArg(0));
+   unsigned addressSpace =
+     cast<llvm::PointerType>(Op->getType())->getAddressSpace();
+   llvm::Type *Ty = llvm::Type::getFP128PtrTy(getLLVMContext(), addressSpace);
+   Op = Builder.CreateBitCast(Op, Ty);
+   Op = Builder.CreateLoad(Op);
+   Ty = llvm::Type::getIntNTy(getLLVMContext(), 128);
+   return Builder.CreateBitCast(Op, Ty);
+  }
+  if (BuiltinID == AArch64::BI__builtin_neon_vstrq_p128) {
+    Value *Op0 = EmitScalarExpr(E->getArg(0));
+    unsigned addressSpace =
+      cast<llvm::PointerType>(Op0->getType())->getAddressSpace();
+    llvm::Type *PTy = llvm::Type::getFP128PtrTy(getLLVMContext(), addressSpace);
+    Op0 = Builder.CreateBitCast(Op0, PTy);
+    Value *Op1 = EmitScalarExpr(E->getArg(1));
+    llvm::Type *Ty = llvm::Type::getFP128Ty(getLLVMContext());
+    Op1 = Builder.CreateBitCast(Op1, Ty);
+    return Builder.CreateStore(Op1, Op0);
+  }
   for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) {
     if (i == 0) {
       switch (BuiltinID) {
diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp
index 7abbe7804c..97c041c901 100644
--- a/lib/Sema/SemaChecking.cpp
+++ b/lib/Sema/SemaChecking.cpp
@@ -341,6 +341,8 @@ static unsigned RFT(unsigned t, bool shift = false) {
   case NeonTypeFlags::Int64:
   case NeonTypeFlags::Poly64:
     return shift ? 63 : (1 << IsQuad) - 1;
+  case NeonTypeFlags::Poly128:
+    return shift ? 127 : (1 << IsQuad) - 1;
   case NeonTypeFlags::Float16:
     assert(!shift && "cannot shift float types!");
     return (4 << IsQuad) - 1;
@@ -374,6 +376,8 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context,
     return IsAArch64 ? Context.UnsignedShortTy : Context.ShortTy;
   case NeonTypeFlags::Poly64:
     return Context.UnsignedLongLongTy;
+  case NeonTypeFlags::Poly128:
+    break;
   case NeonTypeFlags::Float16:
     return Context.HalfTy;
   case NeonTypeFlags::Float32:
diff --git a/test/CodeGen/aarch64-poly128.c b/test/CodeGen/aarch64-poly128.c
new file mode 100644
index 0000000000..fde58077d2
--- /dev/null
+++ b/test/CodeGen/aarch64-poly128.c
@@ -0,0 +1,203 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \
+// RUN:   -ffp-contract=fast -S -O3 -o - %s | FileCheck %s
+
+// Test new aarch64 intrinsics with poly128
+// FIXME: Currently, poly128_t equals to uint128, which will be spilt into
+// two 64-bit GPR(eg X0, X1). Now moving data from X0, X1 to FPR128 will
+// introduce 2 store and 1 load instructions(store X0, X1 to memory and
+// then load back to Q0). If target has NEON, this is better replaced by
+// FMOV or INS.
+
+#include <arm_neon.h>
+
+void test_vstrq_p128(poly128_t * ptr, poly128_t val) {
+  // CHECK: test_vstrq_p128
+  vstrq_p128(ptr, val);
+	// CHECK: str	{{x[0-9]+}}, [{{x[0-9]+}}, #8]
+	// CHECK-NEXT: str	 {{x[0-9]+}}, [{{x[0-9]+}}]
+}
+
+poly128_t test_vldrq_p128(poly128_t * ptr) {
+  // CHECK: test_vldrq_p128
+  return vldrq_p128(ptr);
+	// CHECK: ldr	{{x[0-9]+}}, [{{x[0-9]+}}]
+	// CHECK-NEXT: ldr	{{x[0-9]+}}, [{{x[0-9]+}}, #8]
+}
+
+void test_ld_st_p128(poly128_t * ptr) {
+  // CHECK: test_ld_st_p128
+   vstrq_p128(ptr+1, vldrq_p128(ptr));
+	// CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}]
+	// CHECK-NEXT: str	{{q[0-9]+}}, [{{x[0-9]+}}, #16]
+}
+
+poly128_t test_vmull_p64(poly64_t a, poly64_t b) {
+  // CHECK: test_vmull_p64
+  return vmull_p64(a, b);
+  // CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d
+}
+
+poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) {
+  // CHECK: test_vmull_high_p64
+  return vmull_high_p64(a, b);
+  // CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_s8
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_s8(int8x16_t a) {
+  return vreinterpretq_p128_s8(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_s16
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_s16(int16x8_t a) {
+  return vreinterpretq_p128_s16(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_s32
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_s32(int32x4_t a) {
+  return vreinterpretq_p128_s32(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_s64
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_s64(int64x2_t a) {
+  return vreinterpretq_p128_s64(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_u8
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) {
+  return vreinterpretq_p128_u8(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_u16
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) {
+  return vreinterpretq_p128_u16(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_u32
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) {
+  return vreinterpretq_p128_u32(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_u64
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) {
+  return vreinterpretq_p128_u64(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_f32
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_f32(float32x4_t a) {
+  return vreinterpretq_p128_f32(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_f64
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_f64(float64x2_t a) {
+  return vreinterpretq_p128_f64(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_p8
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) {
+  return vreinterpretq_p128_p8(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_p16
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) {
+  return vreinterpretq_p128_p16(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p128_p64
+// CHECK: ret
+poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) {
+  return vreinterpretq_p128_p64(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_s8_p128
+// CHECK: ret
+int8x16_t test_vreinterpretq_s8_p128(poly128_t a) {
+  return vreinterpretq_s8_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_s16_p128
+// CHECK: ret
+int16x8_t test_vreinterpretq_s16_p128(poly128_t  a) {
+  return vreinterpretq_s16_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_s32_p128
+// CHECK: ret
+int32x4_t test_vreinterpretq_s32_p128(poly128_t a) {
+  return vreinterpretq_s32_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_s64_p128
+// CHECK: ret
+int64x2_t test_vreinterpretq_s64_p128(poly128_t  a) {
+  return vreinterpretq_s64_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_u8_p128
+// CHECK: ret
+uint8x16_t test_vreinterpretq_u8_p128(poly128_t  a) {
+  return vreinterpretq_u8_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_u16_p128
+// CHECK: ret
+uint16x8_t test_vreinterpretq_u16_p128(poly128_t  a) {
+  return vreinterpretq_u16_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_u32_p128
+// CHECK: ret
+uint32x4_t test_vreinterpretq_u32_p128(poly128_t  a) {
+  return vreinterpretq_u32_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_u64_p128
+// CHECK: ret
+uint64x2_t test_vreinterpretq_u64_p128(poly128_t  a) {
+  return vreinterpretq_u64_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_f32_p128
+// CHECK: ret
+float32x4_t test_vreinterpretq_f32_p128(poly128_t  a) {
+  return vreinterpretq_f32_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_f64_p128
+// CHECK: ret
+float64x2_t test_vreinterpretq_f64_p128(poly128_t  a) {
+  return vreinterpretq_f64_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p8_p128
+// CHECK: ret
+poly8x16_t test_vreinterpretq_p8_p128(poly128_t  a) {
+  return vreinterpretq_p8_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p16_p128
+// CHECK: ret
+poly16x8_t test_vreinterpretq_p16_p128(poly128_t  a) {
+  return vreinterpretq_p16_p128(a);
+}
+
+// CHECK-LABEL: test_vreinterpretq_p64_p128
+// CHECK: ret
+poly64x2_t test_vreinterpretq_p64_p128(poly128_t  a) {
+  return vreinterpretq_p64_p128(a);
+}
+
+
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp
index f9eb22e0e7..5329c57596 100644
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -52,6 +52,7 @@ enum OpKind {
   OpMla,
   OpMlal,
   OpMullHi,
+  OpMullHiP64,
   OpMullHiN,
   OpMlalHi,
   OpMlalHiN,
@@ -193,6 +194,7 @@ public:
     Poly8,
     Poly16,
     Poly64,
+    Poly128,
     Float16,
     Float32,
     Float64
@@ -234,6 +236,7 @@ public:
     OpMap["OP_MLA"]   = OpMla;
     OpMap["OP_MLAL"]  = OpMlal;
     OpMap["OP_MULLHi"]  = OpMullHi;
+    OpMap["OP_MULLHi_P64"]  = OpMullHiP64;
     OpMap["OP_MULLHi_N"]  = OpMullHiN;
     OpMap["OP_MLALHi"]  = OpMlalHi;
     OpMap["OP_MLALHi_N"]  = OpMlalHiN;
@@ -403,6 +406,7 @@ static void ParseTypes(Record *r, std::string &s,
       case 's':
       case 'i':
       case 'l':
+      case 'k':
       case 'h':
       case 'f':
       case 'd':
@@ -427,6 +431,8 @@ static char Widen(const char t) {
       return 'i';
     case 'i':
       return 'l';
+    case 'l':
+      return 'k';
     case 'h':
       return 'f';
     case 'f':
@@ -446,6 +452,8 @@ static char Narrow(const char t) {
       return 's';
     case 'l':
       return 'i';
+    case 'k':
+      return 'l';
     case 'f':
       return 'h';
     case 'd':
@@ -469,6 +477,9 @@ static std::string GetNarrowTypestr(StringRef ty)
       case 'l':
         s += 'i';
         break;
+      case 'k':
+        s += 'l';
+        break;
       default:
         s += ty[i];
         break;
@@ -680,6 +691,9 @@ static std::string TypeString(const char mod, StringRef typestr) {
         break;
       s += quad ? "x2" : "x1";
       break;
+    case 'k':
+      s += "poly128";
+      break;
     case 'h':
       s += "float16";
       if (scal)
@@ -745,6 +759,9 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr,
   // Based on the modifying character, change the type and width if necessary.
   type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr);
 
+  usgn = usgn | poly | ((ck == ClassI || ck == ClassW) &&
+                         scal && type != 'f' && type != 'd');
+
   // All pointers are void* pointers.  Change type to 'v' now.
   if (pntr) {
     usgn = false;
@@ -756,8 +773,6 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr,
     type = 's';
     usgn = true;
   }
-  usgn = usgn | poly | ((ck == ClassI || ck == ClassW) &&
-                         scal && type != 'f' && type != 'd');
 
   if (scal) {
     SmallString<128> s;
@@ -769,6 +784,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr,
 
     if (type == 'l') // 64-bit long
       s += "LLi";
+    else if (type == 'k') // 128-bit long
+      s = "LLLi";
     else
       s.push_back(type);
 
@@ -865,6 +882,10 @@ static void InstructionTypeCode(const StringRef &typeStr,
     default: break;
     }
     break;
+  case 'k':
+    assert(poly && "Unrecognized 128 bit integer.");
+    typeCode = "p128";
+    break;
   case 'h':
     switch (ck) {
     case ClassS:
@@ -1605,6 +1626,7 @@ static unsigned GetNumElements(StringRef typestr, bool &quad) {
   case 's': nElts = 4; break;
   case 'i': nElts = 2; break;
   case 'l': nElts = 1; break;
+  case 'k': nElts = 1; break;
   case 'h': nElts = 4; break;
   case 'f': nElts = 2; break;
   case 'd':
@@ -1717,6 +1739,13 @@ static std::string GenOpString(const std::string &name, OpKind op,
   case OpMullHi:
     s += Gen2OpWith2High(typestr, "vmull", "__a", "__b");
     break;
+  case OpMullHiP64: {
+    std::string Op1 = GetHigh("__a", typestr);
+    std::string Op2 = GetHigh("__b", typestr);
+    s += MangleName("vmull", typestr, ClassS);
+    s += "((poly64_t)" + Op1 + ", (poly64_t)" + Op2 + ");";
+    break;
+  }
   case OpMullHiN:
     s += MangleName("vmull_n", typestr, ClassS);
     s += "(" + GetHigh("__a", typestr) + ", __b);";
@@ -2278,6 +2307,9 @@ static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) {
     case 'l':
       ET = poly ? NeonTypeFlags::Poly64 : NeonTypeFlags::Int64;
       break;
+    case 'k':
+      ET = NeonTypeFlags::Poly128;
+      break;
     case 'h':
       ET = NeonTypeFlags::Float16;
       break;
@@ -2569,6 +2601,7 @@ void NeonEmitter::run(raw_ostream &OS) {
   OS << "typedef uint8_t poly8_t;\n";
   OS << "typedef uint16_t poly16_t;\n";
   OS << "typedef uint64_t poly64_t;\n";
+  OS << "typedef __uint128_t poly128_t;\n";
   OS << "#else\n";
   OS << "typedef int8_t poly8_t;\n";
   OS << "typedef int16_t poly16_t;\n";
@@ -2693,6 +2726,7 @@ void NeonEmitter::run(raw_ostream &OS) {
   // Emit AArch64-specific intrinsics.
   OS << "#ifdef __aarch64__\n";
 
+  emitIntrinsic(OS, Records.getDef("VMULL_P64"), EmittedMap);
   emitIntrinsic(OS, Records.getDef("VMOVL_HIGH"), EmittedMap);
   emitIntrinsic(OS, Records.getDef("VMULL_HIGH"), EmittedMap);
   emitIntrinsic(OS, Records.getDef("VABDL_HIGH"), EmittedMap);
@@ -2801,6 +2835,8 @@ static unsigned RangeFromType(const char mod, StringRef typestr) {
     case 'd':
     case 'l':
       return (1 << (int)quad) - 1;
+    case 'k':
+      return 0;
     default:
       PrintFatalError("unhandled type!");
   }
@@ -2824,6 +2860,8 @@ static unsigned RangeScalarShiftImm(const char mod, StringRef typestr) {
     case 'd':
     case 'l':
       return 63;
+    case 'k':
+      return 127;
     default:
       PrintFatalError("unhandled type!");
   }