From: Kevin Qin Date: Tue, 10 Dec 2013 06:49:01 +0000 (+0000) Subject: [AArch64 NEON] Support poly128_t and implement relevant intrinsic. X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3071f44f362397a188b0899f7f6f5e7c53377d19;p=clang [AArch64 NEON] Support poly128_t and implement relevant intrinsic. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@196888 91177308-0d34-0410-b5e6-96231b3b80d8 --- diff --git a/include/clang/Basic/TargetBuiltins.h b/include/clang/Basic/TargetBuiltins.h index ed3cc49fed..e2b5b2423f 100644 --- a/include/clang/Basic/TargetBuiltins.h +++ b/include/clang/Basic/TargetBuiltins.h @@ -91,6 +91,7 @@ namespace clang { Poly8, Poly16, Poly64, + Poly128, Float16, Float32, Float64 diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td index b918459f4e..a1065d9f72 100644 --- a/include/clang/Basic/arm_neon.td +++ b/include/clang/Basic/arm_neon.td @@ -30,6 +30,7 @@ def OP_MUL : Op; def OP_MLA : Op; def OP_MLAL : Op; def OP_MULLHi : Op; +def OP_MULLHi_P64 : Op; def OP_MULLHi_N : Op; def OP_MLALHi : Op; def OP_MLALHi_N : Op; @@ -224,6 +225,7 @@ class NoTestOpInst : Inst {} // s: short // i: int // l: long +// k: 128-bit long // f: float // h: half-float // d: double @@ -603,6 +605,9 @@ def LD3_DUP : WInst<"vld3_dup", "3c", def LD4_DUP : WInst<"vld4_dup", "4c", "QUcQUsQUiQUlQcQsQiQlQhQfQdQPcQPsQPlUcUsUiUlcsilhfdPcPsPl">; +def VLDRQ : WInst<"vldrq", "sc", "Pk">; +def VSTRQ : WInst<"vstrq", "vps", "Pk">; + //////////////////////////////////////////////////////////////////////////////// // Addition // With additional d, Qd type. @@ -854,6 +859,9 @@ def VQDMLAL_HIGH : SOpInst<"vqdmlal_high", "wwkk", "si", OP_QDMLALHi>; def VQDMLAL_HIGH_N : SOpInst<"vqdmlal_high_n", "wwks", "si", OP_QDMLALHi_N>; def VQDMLSL_HIGH : SOpInst<"vqdmlsl_high", "wwkk", "si", OP_QDMLSLHi>; def VQDMLSL_HIGH_N : SOpInst<"vqdmlsl_high_n", "wwks", "si", OP_QDMLSLHi_N>; +def VMULL_P64 : SInst<"vmull", "rss", "Pl">; +def VMULL_HIGH_P64 : SOpInst<"vmull_high", "rdd", "HPl", OP_MULLHi_P64>; + //////////////////////////////////////////////////////////////////////////////// // Extract or insert element from vector @@ -1024,7 +1032,7 @@ def VQTBX4_A64 : WInst<"vqtbx4", "ddDt", "UccPcQUcQcQPc">; // With additional d, Qd, pl, Qpl types def REINTERPRET : NoTestOpInst<"vreinterpret", "dd", - "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPl", OP_REINT>; + "csilUcUsUiUlhfdPcPsPlQcQsQiQlQUcQUsQUiQUlQhQfQdQPcQPsQPlQPk", OP_REINT>; //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp index 108c0f0887..8bf50a0adb 100644 --- a/lib/CodeGen/CGBuiltin.cpp +++ b/lib/CodeGen/CGBuiltin.cpp @@ -1624,6 +1624,11 @@ static llvm::VectorType *GetNeonType(CodeGenFunction *CGF, case NeonTypeFlags::Int64: case NeonTypeFlags::Poly64: return llvm::VectorType::get(CGF->Int64Ty, V1Ty ? 1 : (1 << IsQuad)); + case NeonTypeFlags::Poly128: + // FIXME: i128 and f128 doesn't get fully support in Clang and llvm. + // There is a lot of i128 and f128 API missing. + // so we use v16i8 to represent poly128 and get pattern matched. + return llvm::VectorType::get(CGF->Int8Ty, 16); case NeonTypeFlags::Float32: return llvm::VectorType::get(CGF->FloatTy, V1Ty ? 1 : (2 << IsQuad)); case NeonTypeFlags::Float64: @@ -2555,6 +2560,9 @@ static Value *EmitAArch64ScalarBuiltinExpr(CodeGenFunction &CGF, case AArch64::BI__builtin_neon_vcvtd_n_u64_f64: Int = Intrinsic::aarch64_neon_vcvtd_n_u64_f64; s = "fcvtzu"; OverloadInt = false; break; + case AArch64::BI__builtin_neon_vmull_p64: + Int = Intrinsic::aarch64_neon_vmull_p64; + s = "vmull"; OverloadInt = false; break; } if (!Int) @@ -2908,6 +2916,28 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, SmallVector Ops; llvm::Value *Align = 0; // Alignment for load/store + + if (BuiltinID == AArch64::BI__builtin_neon_vldrq_p128) { + Value *Op = EmitScalarExpr(E->getArg(0)); + unsigned addressSpace = + cast(Op->getType())->getAddressSpace(); + llvm::Type *Ty = llvm::Type::getFP128PtrTy(getLLVMContext(), addressSpace); + Op = Builder.CreateBitCast(Op, Ty); + Op = Builder.CreateLoad(Op); + Ty = llvm::Type::getIntNTy(getLLVMContext(), 128); + return Builder.CreateBitCast(Op, Ty); + } + if (BuiltinID == AArch64::BI__builtin_neon_vstrq_p128) { + Value *Op0 = EmitScalarExpr(E->getArg(0)); + unsigned addressSpace = + cast(Op0->getType())->getAddressSpace(); + llvm::Type *PTy = llvm::Type::getFP128PtrTy(getLLVMContext(), addressSpace); + Op0 = Builder.CreateBitCast(Op0, PTy); + Value *Op1 = EmitScalarExpr(E->getArg(1)); + llvm::Type *Ty = llvm::Type::getFP128Ty(getLLVMContext()); + Op1 = Builder.CreateBitCast(Op1, Ty); + return Builder.CreateStore(Op1, Op0); + } for (unsigned i = 0, e = E->getNumArgs() - 1; i != e; i++) { if (i == 0) { switch (BuiltinID) { diff --git a/lib/Sema/SemaChecking.cpp b/lib/Sema/SemaChecking.cpp index 7abbe7804c..97c041c901 100644 --- a/lib/Sema/SemaChecking.cpp +++ b/lib/Sema/SemaChecking.cpp @@ -341,6 +341,8 @@ static unsigned RFT(unsigned t, bool shift = false) { case NeonTypeFlags::Int64: case NeonTypeFlags::Poly64: return shift ? 63 : (1 << IsQuad) - 1; + case NeonTypeFlags::Poly128: + return shift ? 127 : (1 << IsQuad) - 1; case NeonTypeFlags::Float16: assert(!shift && "cannot shift float types!"); return (4 << IsQuad) - 1; @@ -374,6 +376,8 @@ static QualType getNeonEltType(NeonTypeFlags Flags, ASTContext &Context, return IsAArch64 ? Context.UnsignedShortTy : Context.ShortTy; case NeonTypeFlags::Poly64: return Context.UnsignedLongLongTy; + case NeonTypeFlags::Poly128: + break; case NeonTypeFlags::Float16: return Context.HalfTy; case NeonTypeFlags::Float32: diff --git a/test/CodeGen/aarch64-poly128.c b/test/CodeGen/aarch64-poly128.c new file mode 100644 index 0000000000..fde58077d2 --- /dev/null +++ b/test/CodeGen/aarch64-poly128.c @@ -0,0 +1,203 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon \ +// RUN: -ffp-contract=fast -S -O3 -o - %s | FileCheck %s + +// Test new aarch64 intrinsics with poly128 +// FIXME: Currently, poly128_t equals to uint128, which will be spilt into +// two 64-bit GPR(eg X0, X1). Now moving data from X0, X1 to FPR128 will +// introduce 2 store and 1 load instructions(store X0, X1 to memory and +// then load back to Q0). If target has NEON, this is better replaced by +// FMOV or INS. + +#include + +void test_vstrq_p128(poly128_t * ptr, poly128_t val) { + // CHECK: test_vstrq_p128 + vstrq_p128(ptr, val); + // CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}, #8] + // CHECK-NEXT: str {{x[0-9]+}}, [{{x[0-9]+}}] +} + +poly128_t test_vldrq_p128(poly128_t * ptr) { + // CHECK: test_vldrq_p128 + return vldrq_p128(ptr); + // CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}] + // CHECK-NEXT: ldr {{x[0-9]+}}, [{{x[0-9]+}}, #8] +} + +void test_ld_st_p128(poly128_t * ptr) { + // CHECK: test_ld_st_p128 + vstrq_p128(ptr+1, vldrq_p128(ptr)); + // CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}] + // CHECK-NEXT: str {{q[0-9]+}}, [{{x[0-9]+}}, #16] +} + +poly128_t test_vmull_p64(poly64_t a, poly64_t b) { + // CHECK: test_vmull_p64 + return vmull_p64(a, b); + // CHECK: pmull {{v[0-9]+}}.1q, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d +} + +poly128_t test_vmull_high_p64(poly64x2_t a, poly64x2_t b) { + // CHECK: test_vmull_high_p64 + return vmull_high_p64(a, b); + // CHECK: pmull2 {{v[0-9]+}}.1q, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +} + +// CHECK-LABEL: test_vreinterpretq_p128_s8 +// CHECK: ret +poly128_t test_vreinterpretq_p128_s8(int8x16_t a) { + return vreinterpretq_p128_s8(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_s16 +// CHECK: ret +poly128_t test_vreinterpretq_p128_s16(int16x8_t a) { + return vreinterpretq_p128_s16(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_s32 +// CHECK: ret +poly128_t test_vreinterpretq_p128_s32(int32x4_t a) { + return vreinterpretq_p128_s32(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_s64 +// CHECK: ret +poly128_t test_vreinterpretq_p128_s64(int64x2_t a) { + return vreinterpretq_p128_s64(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_u8 +// CHECK: ret +poly128_t test_vreinterpretq_p128_u8(uint8x16_t a) { + return vreinterpretq_p128_u8(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_u16 +// CHECK: ret +poly128_t test_vreinterpretq_p128_u16(uint16x8_t a) { + return vreinterpretq_p128_u16(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_u32 +// CHECK: ret +poly128_t test_vreinterpretq_p128_u32(uint32x4_t a) { + return vreinterpretq_p128_u32(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_u64 +// CHECK: ret +poly128_t test_vreinterpretq_p128_u64(uint64x2_t a) { + return vreinterpretq_p128_u64(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_f32 +// CHECK: ret +poly128_t test_vreinterpretq_p128_f32(float32x4_t a) { + return vreinterpretq_p128_f32(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_f64 +// CHECK: ret +poly128_t test_vreinterpretq_p128_f64(float64x2_t a) { + return vreinterpretq_p128_f64(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_p8 +// CHECK: ret +poly128_t test_vreinterpretq_p128_p8(poly8x16_t a) { + return vreinterpretq_p128_p8(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_p16 +// CHECK: ret +poly128_t test_vreinterpretq_p128_p16(poly16x8_t a) { + return vreinterpretq_p128_p16(a); +} + +// CHECK-LABEL: test_vreinterpretq_p128_p64 +// CHECK: ret +poly128_t test_vreinterpretq_p128_p64(poly64x2_t a) { + return vreinterpretq_p128_p64(a); +} + +// CHECK-LABEL: test_vreinterpretq_s8_p128 +// CHECK: ret +int8x16_t test_vreinterpretq_s8_p128(poly128_t a) { + return vreinterpretq_s8_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_s16_p128 +// CHECK: ret +int16x8_t test_vreinterpretq_s16_p128(poly128_t a) { + return vreinterpretq_s16_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_s32_p128 +// CHECK: ret +int32x4_t test_vreinterpretq_s32_p128(poly128_t a) { + return vreinterpretq_s32_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_s64_p128 +// CHECK: ret +int64x2_t test_vreinterpretq_s64_p128(poly128_t a) { + return vreinterpretq_s64_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_u8_p128 +// CHECK: ret +uint8x16_t test_vreinterpretq_u8_p128(poly128_t a) { + return vreinterpretq_u8_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_u16_p128 +// CHECK: ret +uint16x8_t test_vreinterpretq_u16_p128(poly128_t a) { + return vreinterpretq_u16_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_u32_p128 +// CHECK: ret +uint32x4_t test_vreinterpretq_u32_p128(poly128_t a) { + return vreinterpretq_u32_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_u64_p128 +// CHECK: ret +uint64x2_t test_vreinterpretq_u64_p128(poly128_t a) { + return vreinterpretq_u64_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_f32_p128 +// CHECK: ret +float32x4_t test_vreinterpretq_f32_p128(poly128_t a) { + return vreinterpretq_f32_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_f64_p128 +// CHECK: ret +float64x2_t test_vreinterpretq_f64_p128(poly128_t a) { + return vreinterpretq_f64_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_p8_p128 +// CHECK: ret +poly8x16_t test_vreinterpretq_p8_p128(poly128_t a) { + return vreinterpretq_p8_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_p16_p128 +// CHECK: ret +poly16x8_t test_vreinterpretq_p16_p128(poly128_t a) { + return vreinterpretq_p16_p128(a); +} + +// CHECK-LABEL: test_vreinterpretq_p64_p128 +// CHECK: ret +poly64x2_t test_vreinterpretq_p64_p128(poly128_t a) { + return vreinterpretq_p64_p128(a); +} + + diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp index f9eb22e0e7..5329c57596 100644 --- a/utils/TableGen/NeonEmitter.cpp +++ b/utils/TableGen/NeonEmitter.cpp @@ -52,6 +52,7 @@ enum OpKind { OpMla, OpMlal, OpMullHi, + OpMullHiP64, OpMullHiN, OpMlalHi, OpMlalHiN, @@ -193,6 +194,7 @@ public: Poly8, Poly16, Poly64, + Poly128, Float16, Float32, Float64 @@ -234,6 +236,7 @@ public: OpMap["OP_MLA"] = OpMla; OpMap["OP_MLAL"] = OpMlal; OpMap["OP_MULLHi"] = OpMullHi; + OpMap["OP_MULLHi_P64"] = OpMullHiP64; OpMap["OP_MULLHi_N"] = OpMullHiN; OpMap["OP_MLALHi"] = OpMlalHi; OpMap["OP_MLALHi_N"] = OpMlalHiN; @@ -403,6 +406,7 @@ static void ParseTypes(Record *r, std::string &s, case 's': case 'i': case 'l': + case 'k': case 'h': case 'f': case 'd': @@ -427,6 +431,8 @@ static char Widen(const char t) { return 'i'; case 'i': return 'l'; + case 'l': + return 'k'; case 'h': return 'f'; case 'f': @@ -446,6 +452,8 @@ static char Narrow(const char t) { return 's'; case 'l': return 'i'; + case 'k': + return 'l'; case 'f': return 'h'; case 'd': @@ -469,6 +477,9 @@ static std::string GetNarrowTypestr(StringRef ty) case 'l': s += 'i'; break; + case 'k': + s += 'l'; + break; default: s += ty[i]; break; @@ -680,6 +691,9 @@ static std::string TypeString(const char mod, StringRef typestr) { break; s += quad ? "x2" : "x1"; break; + case 'k': + s += "poly128"; + break; case 'h': s += "float16"; if (scal) @@ -745,6 +759,9 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, // Based on the modifying character, change the type and width if necessary. type = ModType(mod, type, quad, poly, usgn, scal, cnst, pntr); + usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && + scal && type != 'f' && type != 'd'); + // All pointers are void* pointers. Change type to 'v' now. if (pntr) { usgn = false; @@ -756,8 +773,6 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, type = 's'; usgn = true; } - usgn = usgn | poly | ((ck == ClassI || ck == ClassW) && - scal && type != 'f' && type != 'd'); if (scal) { SmallString<128> s; @@ -769,6 +784,8 @@ static std::string BuiltinTypeString(const char mod, StringRef typestr, if (type == 'l') // 64-bit long s += "LLi"; + else if (type == 'k') // 128-bit long + s = "LLLi"; else s.push_back(type); @@ -865,6 +882,10 @@ static void InstructionTypeCode(const StringRef &typeStr, default: break; } break; + case 'k': + assert(poly && "Unrecognized 128 bit integer."); + typeCode = "p128"; + break; case 'h': switch (ck) { case ClassS: @@ -1605,6 +1626,7 @@ static unsigned GetNumElements(StringRef typestr, bool &quad) { case 's': nElts = 4; break; case 'i': nElts = 2; break; case 'l': nElts = 1; break; + case 'k': nElts = 1; break; case 'h': nElts = 4; break; case 'f': nElts = 2; break; case 'd': @@ -1717,6 +1739,13 @@ static std::string GenOpString(const std::string &name, OpKind op, case OpMullHi: s += Gen2OpWith2High(typestr, "vmull", "__a", "__b"); break; + case OpMullHiP64: { + std::string Op1 = GetHigh("__a", typestr); + std::string Op2 = GetHigh("__b", typestr); + s += MangleName("vmull", typestr, ClassS); + s += "((poly64_t)" + Op1 + ", (poly64_t)" + Op2 + ");"; + break; + } case OpMullHiN: s += MangleName("vmull_n", typestr, ClassS); s += "(" + GetHigh("__a", typestr) + ", __b);"; @@ -2278,6 +2307,9 @@ static unsigned GetNeonEnum(const std::string &proto, StringRef typestr) { case 'l': ET = poly ? NeonTypeFlags::Poly64 : NeonTypeFlags::Int64; break; + case 'k': + ET = NeonTypeFlags::Poly128; + break; case 'h': ET = NeonTypeFlags::Float16; break; @@ -2569,6 +2601,7 @@ void NeonEmitter::run(raw_ostream &OS) { OS << "typedef uint8_t poly8_t;\n"; OS << "typedef uint16_t poly16_t;\n"; OS << "typedef uint64_t poly64_t;\n"; + OS << "typedef __uint128_t poly128_t;\n"; OS << "#else\n"; OS << "typedef int8_t poly8_t;\n"; OS << "typedef int16_t poly16_t;\n"; @@ -2693,6 +2726,7 @@ void NeonEmitter::run(raw_ostream &OS) { // Emit AArch64-specific intrinsics. OS << "#ifdef __aarch64__\n"; + emitIntrinsic(OS, Records.getDef("VMULL_P64"), EmittedMap); emitIntrinsic(OS, Records.getDef("VMOVL_HIGH"), EmittedMap); emitIntrinsic(OS, Records.getDef("VMULL_HIGH"), EmittedMap); emitIntrinsic(OS, Records.getDef("VABDL_HIGH"), EmittedMap); @@ -2801,6 +2835,8 @@ static unsigned RangeFromType(const char mod, StringRef typestr) { case 'd': case 'l': return (1 << (int)quad) - 1; + case 'k': + return 0; default: PrintFatalError("unhandled type!"); } @@ -2824,6 +2860,8 @@ static unsigned RangeScalarShiftImm(const char mod, StringRef typestr) { case 'd': case 'l': return 63; + case 'k': + return 127; default: PrintFatalError("unhandled type!"); }