[AArch64] Implement FP16FML intrinsics

author Bryan Chan <bryan.chan@huawei.com>

Thu, 25 Oct 2018 23:47:00 +0000 (23:47 +0000)

committer Bryan Chan <bryan.chan@huawei.com>

Thu, 25 Oct 2018 23:47:00 +0000 (23:47 +0000)
author Bryan Chan <bryan.chan@huawei.com>
Thu, 25 Oct 2018 23:47:00 +0000 (23:47 +0000)
committer Bryan Chan <bryan.chan@huawei.com>
Thu, 25 Oct 2018 23:47:00 +0000 (23:47 +0000)
diff --git a/include/clang/Basic/arm_neon.td b/include/clang/Basic/arm_neon.td

index 40c9547746ef2ebb70d28761bce75f87fe95fc6c..af049be7e295618681a8fe6a2b4268bf4f53064f 100644 (file)
--- a/include/clang/Basic/arm_neon.td
+++ b/include/clang/Basic/arm_neon.td
@@ -206,6 +206,15 @@ def OP_DOT_LNQ
      : Op<(call "vdot", $p0, $p1,
            (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>;
  
+def OP_FMLAL_LN     : Op<(call "vfmlal_low", $p0, $p1,
+                           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLSL_LN     : Op<(call "vfmlsl_low", $p0, $p1,
+                           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLAL_LN_Hi  : Op<(call "vfmlal_high", $p0, $p1,
+                           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_FMLSL_LN_Hi  : Op<(call "vfmlsl_high", $p0, $p1,
+                           (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+
  //===----------------------------------------------------------------------===//
  // Instructions
  //===----------------------------------------------------------------------===//
@@ -1640,3 +1649,21 @@ let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in {
    // Variants indexing into a 128-bit vector are A64 only.
    def UDOT_LANEQ : SOpInst<"vdot_laneq", "dd89i", "iUiQiQUi", OP_DOT_LNQ>;
  }
+
+// v8.2-A FP16 fused multiply-add long instructions.
+let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in {
+  def VFMLAL_LOW  : SInst<"vfmlal_low", "ffHH", "UiQUi">;
+  def VFMLSL_LOW  : SInst<"vfmlsl_low", "ffHH", "UiQUi">;
+  def VFMLAL_HIGH : SInst<"vfmlal_high", "ffHH", "UiQUi">;
+  def VFMLSL_HIGH : SInst<"vfmlsl_high", "ffHH", "UiQUi">;
+
+  def VFMLAL_LANE_LOW  : SOpInst<"vfmlal_lane_low", "ffH0i", "UiQUi", OP_FMLAL_LN>;
+  def VFMLSL_LANE_LOW  : SOpInst<"vfmlsl_lane_low", "ffH0i", "UiQUi", OP_FMLSL_LN>;
+  def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "ffH0i", "UiQUi", OP_FMLAL_LN_Hi>;
+  def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "ffH0i", "UiQUi", OP_FMLSL_LN_Hi>;
+
+  def VFMLAL_LANEQ_LOW  : SOpInst<"vfmlal_laneq_low", "ffH1i", "UiQUi", OP_FMLAL_LN>;
+  def VFMLSL_LANEQ_LOW  : SOpInst<"vfmlsl_laneq_low", "ffH1i", "UiQUi", OP_FMLSL_LN>;
+  def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "ffH1i", "UiQUi", OP_FMLAL_LN_Hi>;
+  def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "ffH1i", "UiQUi", OP_FMLSL_LN_Hi>;
+}
diff --git a/include/clang/Basic/arm_neon_incl.td b/include/clang/Basic/arm_neon_incl.td

index 5f3aa6fc02213a4f327a5e1f3fad7d63dda3fd79..09df22ea5ca245ef92946e98c4542bcaffef26f0 100644 (file)
--- a/include/clang/Basic/arm_neon_incl.td
+++ b/include/clang/Basic/arm_neon_incl.td
@@ -96,6 +96,11 @@ def bitcast;
  // example: (dup $p1) -> "(uint32x2_t) {__p1, __p1}" (assuming the base type
  //          is uint32x2_t).
  def dup;
+// dup_typed - Take a vector and a scalar argument, and create a new vector of
+//             the same type by duplicating the scalar value into all lanes.
+// example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}"
+//          (assuming __p1 is float16x4_t, and __p2 is a compatible scalar).
+def dup_typed;
  // splat - Take a vector and a lane index, and return a vector of the same type
  //         containing repeated instances of the source vector at the lane index.
  // example: (splat $p0, $p1) ->
@@ -229,6 +234,8 @@ def OP_UNAVAILABLE : Operation {
  // f: float (int args)
  // F: double (int args)
  // H: half (int args)
+// 0: half (int args), ignore 'Q' size modifier.
+// 1: half (int args), force 'Q' size modifier.
  // d: default
  // g: default, ignore 'Q' size modifier.
  // j: default, force 'Q' size modifier.
diff --git a/lib/Basic/Targets/AArch64.cpp b/lib/Basic/Targets/AArch64.cpp

index b3a01ae25944fdf388f9cb6de309b1843606d0b1..1f91214a451b7e963b81641a49201e24ad058d84 100644 (file)
--- a/lib/Basic/Targets/AArch64.cpp
+++ b/lib/Basic/Targets/AArch64.cpp
@@ -194,6 +194,9 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts,
    if (HasDotProd)
      Builder.defineMacro("__ARM_FEATURE_DOTPROD", "1");
  
+  if ((FPU & NeonMode) && HasFP16FML)
+    Builder.defineMacro("__ARM_FEATURE_FP16FML", "1");
+
    switch (ArchKind) {
    default:
      break;
@@ -231,6 +234,7 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
    Unaligned = 1;
    HasFullFP16 = 0;
    HasDotProd = 0;
+  HasFP16FML = 0;
    ArchKind = llvm::AArch64::ArchKind::ARMV8A;
  
    for (const auto &Feature : Features) {
@@ -252,6 +256,8 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
        HasFullFP16 = 1;
      if (Feature == "+dotprod")
        HasDotProd = 1;
+    if (Feature == "+fp16fml")
+      HasFP16FML = 1;
    }
  
    setDataLayout();
diff --git a/lib/Basic/Targets/AArch64.h b/lib/Basic/Targets/AArch64.h

index a9df895e4dad3ea11a815771c73e86278894c412..d7f767abd4d10c5fd21bf093e7b7fce5f2303b17 100644 (file)
--- a/lib/Basic/Targets/AArch64.h
+++ b/lib/Basic/Targets/AArch64.h
@@ -34,6 +34,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
    unsigned Unaligned;
    unsigned HasFullFP16;
    unsigned HasDotProd;
+  unsigned HasFP16FML;
    llvm::AArch64::ArchKind ArchKind;
  
    static const Builtin::Info BuiltinInfo[];
diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp

index 3e8debbdb0c899cf6724c4dfaceb489a8872c192..f058109ed5ddbcd6f72dcb4a2612884d89799aac 100644 (file)
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -4368,6 +4368,14 @@ static const NeonIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
    NEONMAP0(vextq_v),
    NEONMAP0(vfma_v),
    NEONMAP0(vfmaq_v),
+  NEONMAP1(vfmlal_high_v, aarch64_neon_fmlal2, 0),
+  NEONMAP1(vfmlal_low_v, aarch64_neon_fmlal, 0),
+  NEONMAP1(vfmlalq_high_v, aarch64_neon_fmlal2, 0),
+  NEONMAP1(vfmlalq_low_v, aarch64_neon_fmlal, 0),
+  NEONMAP1(vfmlsl_high_v, aarch64_neon_fmlsl2, 0),
+  NEONMAP1(vfmlsl_low_v, aarch64_neon_fmlsl, 0),
+  NEONMAP1(vfmlslq_high_v, aarch64_neon_fmlsl2, 0),
+  NEONMAP1(vfmlslq_low_v, aarch64_neon_fmlsl, 0),
    NEONMAP2(vhadd_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
    NEONMAP2(vhaddq_v, aarch64_neon_uhadd, aarch64_neon_shadd, Add1ArgType | UnsignedAlts),
    NEONMAP2(vhsub_v, aarch64_neon_uhsub, aarch64_neon_shsub, Add1ArgType | UnsignedAlts),
@@ -5341,6 +5349,34 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
      Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
      return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vdot");
    }
+  case NEON::BI__builtin_neon_vfmlal_low_v:
+  case NEON::BI__builtin_neon_vfmlalq_low_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_low");
+  }
+  case NEON::BI__builtin_neon_vfmlsl_low_v:
+  case NEON::BI__builtin_neon_vfmlslq_low_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_low");
+  }
+  case NEON::BI__builtin_neon_vfmlal_high_v:
+  case NEON::BI__builtin_neon_vfmlalq_high_v: {
+    llvm::Type *InputTy =
+           llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlal_high");
+  }
+  case NEON::BI__builtin_neon_vfmlsl_high_v:
+  case NEON::BI__builtin_neon_vfmlslq_high_v: {
+    llvm::Type *InputTy =
+           llvm::VectorType::get(HalfTy, Ty->getPrimitiveSizeInBits() / 16);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
+  }
    }
  
    assert(Int && "Expected valid intrinsic number");
diff --git a/test/CodeGen/aarch64-neon-fp16fml.c b/test/CodeGen/aarch64-neon-fp16fml.c

new file mode 100644 (file)

index 0000000..ad3dd9c
--- /dev/null
+++ b/test/CodeGen/aarch64-neon-fp16fml.c
@@ -0,0 +1,196 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +v8.2a -target-feature +neon -target-feature +fp16fml \
+// RUN: -fallow-half-arguments-and-returns -disable-O0-optnone -emit-llvm -o - %s | opt -S -instcombine | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+// Test AArch64 Armv8.2-A FP16 Fused Multiply-Add Long intrinsics
+
+#include <arm_neon.h>
+
+// Vector form
+
+float32x2_t test_vfmlal_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_low_u32(a, b, c);
+}
+
+float32x2_t test_vfmlsl_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_low_u32(a, b, c);
+}
+
+float32x2_t test_vfmlal_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_high_u32(a, b, c);
+}
+
+float32x2_t test_vfmlsl_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_high_u32(a, b, c);
+}
+
+float32x4_t test_vfmlalq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_low_u32(a, b, c);
+}
+
+float32x4_t test_vfmlslq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_low_u32(a, b, c);
+}
+
+float32x4_t test_vfmlalq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_high_u32(a, b, c);
+}
+
+float32x4_t test_vfmlslq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_high_u32(a, b, c);
+}
+
+// Indexed form
+
+float32x2_t test_vfmlal_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_lane_low_u32(a, b, c, 0);
+}
+
+float32x2_t test_vfmlal_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_lane_high_u32(a, b, c, 1);
+}
+
+float32x4_t test_vfmlalq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_lane_low_u32(a, b, c, 2);
+}
+
+float32x4_t test_vfmlalq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_lane_high_u32(a, b, c, 3);
+}
+
+float32x2_t test_vfmlal_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_laneq_low_u32(a, b, c, 4);
+}
+
+float32x2_t test_vfmlal_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlal_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlal2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlal_laneq_high_u32(a, b, c, 5);
+}
+
+float32x4_t test_vfmlalq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_laneq_low_u32(a, b, c, 6);
+}
+
+float32x4_t test_vfmlalq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlalq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlal2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlalq_laneq_high_u32(a, b, c, 7);
+}
+
+float32x2_t test_vfmlsl_lane_low_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_low_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> zeroinitializer
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_lane_low_u32(a, b, c, 0);
+}
+
+float32x2_t test_vfmlsl_lane_high_u32(float32x2_t a, float16x4_t b, float16x4_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_lane_high_u32(<2 x float> %a, <4 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_lane_high_u32(a, b, c, 1);
+}
+
+float32x4_t test_vfmlslq_lane_low_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_low_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_lane_low_u32(a, b, c, 2);
+}
+
+float32x4_t test_vfmlslq_lane_high_u32(float32x4_t a, float16x8_t b, float16x4_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_lane_high_u32(<4 x float> %a, <8 x half> %b, <4 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x half> %c, <4 x half> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_lane_high_u32(a, b, c, 3);
+}
+
+float32x2_t test_vfmlsl_laneq_low_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_low_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 4, i32 4, i32 4, i32 4>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_laneq_low_u32(a, b, c, 4);
+}
+
+float32x2_t test_vfmlsl_laneq_high_u32(float32x2_t a, float16x4_t b, float16x8_t c) {
+// CHECK-LABEL: define <2 x float> @test_vfmlsl_laneq_high_u32(<2 x float> %a, <4 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+// CHECK: [[RESULT:%.*]] = call <2 x float> @llvm.aarch64.neon.fmlsl2.v2f32.v4f16(<2 x float> %a, <4 x half> %b, <4 x half> [[SHUFFLE]])
+// CHECK: ret <2 x float> [[RESULT]]
+  return vfmlsl_laneq_high_u32(a, b, c, 5);
+}
+
+float32x4_t test_vfmlslq_laneq_low_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_low_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6, i32 6>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_laneq_low_u32(a, b, c, 6);
+}
+
+float32x4_t test_vfmlslq_laneq_high_u32(float32x4_t a, float16x8_t b, float16x8_t c) {
+// CHECK-LABEL: define <4 x float> @test_vfmlslq_laneq_high_u32(<4 x float> %a, <8 x half> %b, <8 x half> %c)
+// CHECK: [[SHUFFLE:%.*]] = shufflevector <8 x half> %c, <8 x half> undef, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+// CHECK: [[RESULT:%.*]] = call <4 x float> @llvm.aarch64.neon.fmlsl2.v4f32.v8f16(<4 x float> %a, <8 x half> %b, <8 x half> [[SHUFFLE]])
+// CHECK: ret <4 x float> [[RESULT]]
+  return vfmlslq_laneq_high_u32(a, b, c, 7);
+}
diff --git a/test/Preprocessor/aarch64-target-features.c b/test/Preprocessor/aarch64-target-features.c

index 8e830d237ce2797d0d213d473d8ff06a371ec0f2..1616b7fc8a350b242f8584baabb59c4b00f5982f 100644 (file)
--- a/test/Preprocessor/aarch64-target-features.c
+++ b/test/Preprocessor/aarch64-target-features.c
@@ -93,16 +93,20 @@
  // RUN: %clang -target aarch64-none-linux-gnu -march=armv8.2a+dotprod -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-DOTPROD %s
  // CHECK-DOTPROD: __ARM_FEATURE_DOTPROD 1
  
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
-// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// On ARMv8.2-A and above, +fp16fml implies +fp16.
+// On ARMv8.4-A and above, +fp16 implies +fp16fml.
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.2-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-FML --check-prefix=CHECK-FULLFP16-VECTOR-SCALAR %s
+// CHECK-FULLFP16-FML:           #define __ARM_FEATURE_FP16FML 1
+// CHECK-FULLFP16-NOFML-NOT:     #define __ARM_FEATURE_FP16FML 1
  // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
  // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
  // CHECK-FULLFP16-VECTOR-SCALAR: #define __ARM_FP 0xE
@@ -114,6 +118,7 @@
  // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
  // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
  // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16+nosimd -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-SCALAR %s
+// CHECK-FULLFP16-SCALAR-NOT:   #define __ARM_FEATURE_FP16FML 1
  // CHECK-FULLFP16-SCALAR:       #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
  // CHECK-FULLFP16-SCALAR-NOT:   #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
  // CHECK-FULLFP16-SCALAR:       #define __ARM_FP 0xE
@@ -127,10 +132,11 @@
  // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
  // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+nofp16fml -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
  // RUN: %clang -target aarch64-none-linux-gnueabi -march=armv8.4-a+fp16fml+nofp16 -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-FULLFP16-NOFML-VECTOR-SCALAR %s
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16FML 1
  // CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1
  // CHECK-FULLFP16-NOFML-VECTOR-SCALAR-NOT: #define __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1
-// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP 0xE
-// CHECK-FULLFP16-NOFML-VECTOR-SCALAR: #define __ARM_FP16_FORMAT_IEEE 1
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR:     #define __ARM_FP 0xE
+// CHECK-FULLFP16-NOFML-VECTOR-SCALAR:     #define __ARM_FP16_FORMAT_IEEE 1
  
  // ================== Check whether -mtune accepts mixed-case features.
  // RUN: %clang -target aarch64 -mtune=CYCLONE -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-MTUNE-CYCLONE %s
diff --git a/utils/TableGen/NeonEmitter.cpp b/utils/TableGen/NeonEmitter.cpp

index df002eef120a062ab548aaf8f3c2d0f5422c6a17..f92110d5d7accd5015fdae46a311bd36dcf95fb1 100644 (file)
--- a/utils/TableGen/NeonEmitter.cpp
+++ b/utils/TableGen/NeonEmitter.cpp
@@ -494,6 +494,7 @@ private:
      std::pair<Type, std::string> emitDagSaveTemp(DagInit *DI);
      std::pair<Type, std::string> emitDagSplat(DagInit *DI);
      std::pair<Type, std::string> emitDagDup(DagInit *DI);
+    std::pair<Type, std::string> emitDagDupTyped(DagInit *DI);
      std::pair<Type, std::string> emitDagShuffle(DagInit *DI);
      std::pair<Type, std::string> emitDagCast(DagInit *DI, bool IsBitCast);
      std::pair<Type, std::string> emitDagCall(DagInit *DI);
@@ -897,6 +898,18 @@ void Type::applyModifier(char Mod) {
      Float = true;
      ElementBitwidth = 16;
      break;
+  case '0':
+    Float = true;
+    if (AppliedQuad)
+      Bitwidth /= 2;
+    ElementBitwidth = 16;
+    break;
+  case '1':
+    Float = true;
+    if (!AppliedQuad)
+      Bitwidth *= 2;
+    ElementBitwidth = 16;
+    break;
    case 'g':
      if (AppliedQuad)
        Bitwidth /= 2;
@@ -1507,6 +1520,8 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDag(DagInit *DI) {
      return emitDagShuffle(DI);
    if (Op == "dup")
      return emitDagDup(DI);
+  if (Op == "dup_typed")
+    return emitDagDupTyped(DI);
    if (Op == "splat")
      return emitDagSplat(DI);
    if (Op == "save_temp")
@@ -1771,6 +1786,28 @@ std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDup(DagInit *DI) {
    return std::make_pair(T, S);
  }
  
+std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) {
+  assert_with_loc(DI->getNumArgs() == 2, "dup_typed() expects two arguments");
+  std::pair<Type, std::string> A = emitDagArg(DI->getArg(0),
+                                              DI->getArgNameStr(0));
+  std::pair<Type, std::string> B = emitDagArg(DI->getArg(1),
+                                              DI->getArgNameStr(1));
+  assert_with_loc(B.first.isScalar(),
+                  "dup_typed() requires a scalar as the second argument");
+
+  Type T = A.first;
+  assert_with_loc(T.isVector(), "dup_typed() used but target type is scalar!");
+  std::string S = "(" + T.str() + ") {";
+  for (unsigned I = 0; I < T.getNumElements(); ++I) {
+    if (I != 0)
+      S += ", ";
+    S += B.second;
+  }
+  S += "}";
+
+  return std::make_pair(T, S);
+}
+
  std::pair<Type, std::string> Intrinsic::DagEmitter::emitDagSplat(DagInit *DI) {
    assert_with_loc(DI->getNumArgs() == 2, "splat() expects two arguments");
    std::pair<Type, std::string> A = emitDagArg(DI->getArg(0),
author	Bryan Chan <bryan.chan@huawei.com>
	Thu, 25 Oct 2018 23:47:00 +0000 (23:47 +0000)
committer	Bryan Chan <bryan.chan@huawei.com>
	Thu, 25 Oct 2018 23:47:00 +0000 (23:47 +0000)
include/clang/Basic/arm_neon.td		patch \| blob \| history
include/clang/Basic/arm_neon_incl.td		patch \| blob \| history
lib/Basic/Targets/AArch64.cpp		patch \| blob \| history
lib/Basic/Targets/AArch64.h		patch \| blob \| history
lib/CodeGen/CGBuiltin.cpp		patch \| blob \| history
test/CodeGen/aarch64-neon-fp16fml.c	[new file with mode: 0644]	patch \| blob
test/Preprocessor/aarch64-target-features.c		patch \| blob \| history
utils/TableGen/NeonEmitter.cpp		patch \| blob \| history