From 28647f65ca9348a7b245c0ca4684c3ee2fa4d56b Mon Sep 17 00:00:00 2001
From: Amara Emerson <aemerson@apple.com>
Date: Thu, 21 Mar 2019 22:31:37 +0000
Subject: [PATCH] [AArch64] Split the neon.addp intrinsic into integer and fp
 variants.

This is the result of discussions on the list about how to deal with intrinsics
which require codegen to disambiguate them via only the integer/fp overloads.
It causes problems for GlobalISel as some of that information is lost during
translation, while with other operations like IR instructions the information is
encoded into the instruction opcode.

This patch changes clang to emit the new faddp intrinsic if the vector operands
to the builtin have FP element types. LLVM IR AutoUpgrade has been taught to
upgrade existing calls to aarch64.neon.addp with fp vector arguments, and
we remove the workarounds introduced for GlobalISel in r355865.

This is a more permanent solution to PR40968.

Differential Revision: https://reviews.llvm.org/D59655

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@356722 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/CGBuiltin.cpp                    | 7 +++++++
 test/CodeGen/aarch64-neon-intrinsics.c       | 6 +++---
 test/CodeGen/aarch64-v8.2a-neon-intrinsics.c | 4 ++--
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/lib/CodeGen/CGBuiltin.cpp b/lib/CodeGen/CGBuiltin.cpp
index 4d9c76ae7f..5042d09227 100644
--- a/lib/CodeGen/CGBuiltin.cpp
+++ b/lib/CodeGen/CGBuiltin.cpp
@@ -5095,6 +5095,13 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr(
 
   switch (BuiltinID) {
   default: break;
+  case NEON::BI__builtin_neon_vpadd_v:
+  case NEON::BI__builtin_neon_vpaddq_v:
+    // We don't allow fp/int overloading of intrinsics.
+    if (VTy->getElementType()->isFloatingPointTy() &&
+        Int == Intrinsic::aarch64_neon_addp)
+      Int = Intrinsic::aarch64_neon_faddp;
+    break;
   case NEON::BI__builtin_neon_vabs_v:
   case NEON::BI__builtin_neon_vabsq_v:
     if (VTy->getElementType()->isFloatingPointTy())
diff --git a/test/CodeGen/aarch64-neon-intrinsics.c b/test/CodeGen/aarch64-neon-intrinsics.c
index 40e39912be..9a5b3a9f18 100644
--- a/test/CodeGen/aarch64-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-neon-intrinsics.c
@@ -4411,7 +4411,7 @@ uint32x2_t test_vpadd_u32(uint32x2_t a, uint32x2_t b) {
 // CHECK-LABEL: @test_vpadd_f32(
 // CHECK:   [[TMP0:%.*]] = bitcast <2 x float> %a to <8 x i8>
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x float> %b to <8 x i8>
-// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.addp.v2f32(<2 x float> %a, <2 x float> %b)
+// CHECK:   [[VPADD_V2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %a, <2 x float> %b)
 // CHECK:   [[VPADD_V3_I:%.*]] = bitcast <2 x float> [[VPADD_V2_I]] to <8 x i8>
 // CHECK:   ret <2 x float> [[VPADD_V2_I]]
 float32x2_t test_vpadd_f32(float32x2_t a, float32x2_t b) {
@@ -4475,7 +4475,7 @@ uint32x4_t test_vpaddq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK-LABEL: @test_vpaddq_f32(
 // CHECK:   [[TMP0:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %b to <16 x i8>
-// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.addp.v4f32(<4 x float> %a, <4 x float> %b)
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %a, <4 x float> %b)
 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <4 x float> [[VPADDQ_V2_I]] to <16 x i8>
 // CHECK:   ret <4 x float> [[VPADDQ_V2_I]]
 float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
@@ -4485,7 +4485,7 @@ float32x4_t test_vpaddq_f32(float32x4_t a, float32x4_t b) {
 // CHECK-LABEL: @test_vpaddq_f64(
 // CHECK:   [[TMP0:%.*]] = bitcast <2 x double> %a to <16 x i8>
 // CHECK:   [[TMP1:%.*]] = bitcast <2 x double> %b to <16 x i8>
-// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.addp.v2f64(<2 x double> %a, <2 x double> %b)
+// CHECK:   [[VPADDQ_V2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %a, <2 x double> %b)
 // CHECK:   [[VPADDQ_V3_I:%.*]] = bitcast <2 x double> [[VPADDQ_V2_I]] to <16 x i8>
 // CHECK:   ret <2 x double> [[VPADDQ_V2_I]]
 float64x2_t test_vpaddq_f64(float64x2_t a, float64x2_t b) {
diff --git a/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
index e1a2e3fb92..a4bf875336 100644
--- a/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
+++ b/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c
@@ -736,14 +736,14 @@ float16x8_t test_vmulxq_f16(float16x8_t a, float16x8_t b) {
 }
 
 // CHECK-LABEL: test_vpadd_f16
-// CHECK:  [[ADD:%.*]] = call <4 x half> @llvm.aarch64.neon.addp.v4f16(<4 x half> %a, <4 x half> %b)
+// CHECK:  [[ADD:%.*]] = call <4 x half> @llvm.aarch64.neon.faddp.v4f16(<4 x half> %a, <4 x half> %b)
 // CHECK:  ret <4 x half> [[ADD]]
 float16x4_t test_vpadd_f16(float16x4_t a, float16x4_t b) {
   return vpadd_f16(a, b);
 }
 
 // CHECK-LABEL: test_vpaddq_f16
-// CHECK:  [[ADD:%.*]] = call <8 x half> @llvm.aarch64.neon.addp.v8f16(<8 x half> %a, <8 x half> %b)
+// CHECK:  [[ADD:%.*]] = call <8 x half> @llvm.aarch64.neon.faddp.v8f16(<8 x half> %a, <8 x half> %b)
 // CHECK:  ret <8 x half> [[ADD]]
 float16x8_t test_vpaddq_f16(float16x8_t a, float16x8_t b) {
   return vpaddq_f16(a, b);
-- 
2.40.0