[CodeGen][ARM] Coerce FP16 vectors to integer vectors when needed

author Mikhail Maltsev <mikhail.maltsev@arm.com>

Wed, 12 Sep 2018 09:19:19 +0000 (09:19 +0000)

committer Mikhail Maltsev <mikhail.maltsev@arm.com>

Wed, 12 Sep 2018 09:19:19 +0000 (09:19 +0000)
author Mikhail Maltsev <mikhail.maltsev@arm.com>
Wed, 12 Sep 2018 09:19:19 +0000 (09:19 +0000)
committer Mikhail Maltsev <mikhail.maltsev@arm.com>
Wed, 12 Sep 2018 09:19:19 +0000 (09:19 +0000)
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp

index 9ad9674ff81bc79915448d358c5b79d3cce85ed0..32edfd853fb4bede3b415d16b16d2f83d479dbc1 100644 (file)
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -5549,6 +5549,9 @@ public:
  private:
    ABIArgInfo classifyReturnType(QualType RetTy, bool isVariadic) const;
    ABIArgInfo classifyArgumentType(QualType RetTy, bool isVariadic) const;
+  ABIArgInfo classifyHomogeneousAggregate(QualType Ty, const Type *Base,
+                                          uint64_t Members) const;
+  ABIArgInfo coerceIllegalVector(QualType Ty) const;
    bool isIllegalVectorType(QualType Ty) const;
  
    bool isHomogeneousAggregateBaseType(QualType Ty) const override;
@@ -5723,6 +5726,41 @@ void ARMABIInfo::setCCs() {
      RuntimeCC = abiCC;
  }
  
+ABIArgInfo ARMABIInfo::coerceIllegalVector(QualType Ty) const {
+  uint64_t Size = getContext().getTypeSize(Ty);
+  if (Size <= 32) {
+    llvm::Type *ResType =
+        llvm::Type::getInt32Ty(getVMContext());
+    return ABIArgInfo::getDirect(ResType);
+  }
+  if (Size == 64 || Size == 128) {
+    llvm::Type *ResType = llvm::VectorType::get(
+        llvm::Type::getInt32Ty(getVMContext()), Size / 32);
+    return ABIArgInfo::getDirect(ResType);
+  }
+  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
+}
+
+ABIArgInfo ARMABIInfo::classifyHomogeneousAggregate(QualType Ty,
+                                                    const Type *Base,
+                                                    uint64_t Members) const {
+  assert(Base && "Base class should be set for homogeneous aggregate");
+  // Base can be a floating-point or a vector.
+  if (const VectorType *VT = Base->getAs<VectorType>()) {
+    // FP16 vectors should be converted to integer vectors
+    if (!getTarget().hasLegalHalfType() &&
+        (VT->getElementType()->isFloat16Type() ||
+          VT->getElementType()->isHalfType())) {
+      uint64_t Size = getContext().getTypeSize(VT);
+      llvm::Type *NewVecTy = llvm::VectorType::get(
+          llvm::Type::getInt32Ty(getVMContext()), Size / 32);
+      llvm::Type *Ty = llvm::ArrayType::get(NewVecTy, Members);
+      return ABIArgInfo::getDirect(Ty, 0, nullptr, false);
+    }
+  }
+  return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
+}
+
  ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
                                              bool isVariadic) const {
    // 6.1.2.1 The following argument types are VFP CPRCs:
@@ -5737,25 +5775,8 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
    Ty = useFirstFieldIfTransparentUnion(Ty);
  
    // Handle illegal vector types here.
-  if (isIllegalVectorType(Ty)) {
-    uint64_t Size = getContext().getTypeSize(Ty);
-    if (Size <= 32) {
-      llvm::Type *ResType =
-          llvm::Type::getInt32Ty(getVMContext());
-      return ABIArgInfo::getDirect(ResType);
-    }
-    if (Size == 64) {
-      llvm::Type *ResType = llvm::VectorType::get(
-          llvm::Type::getInt32Ty(getVMContext()), 2);
-      return ABIArgInfo::getDirect(ResType);
-    }
-    if (Size == 128) {
-      llvm::Type *ResType = llvm::VectorType::get(
-          llvm::Type::getInt32Ty(getVMContext()), 4);
-      return ABIArgInfo::getDirect(ResType);
-    }
-    return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
-  }
+  if (isIllegalVectorType(Ty))
+    return coerceIllegalVector(Ty);
  
    // _Float16 and __fp16 get passed as if it were an int or float, but with
    // the top 16 bits unspecified. This is not done for OpenCL as it handles the
@@ -5791,11 +5812,8 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty,
      // into VFP registers.
      const Type *Base = nullptr;
      uint64_t Members = 0;
-    if (isHomogeneousAggregate(Ty, Base, Members)) {
-      assert(Base && "Base class should be set for homogeneous aggregate");
-      // Base can be a floating-point or a vector.
-      return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
-    }
+    if (isHomogeneousAggregate(Ty, Base, Members))
+      return classifyHomogeneousAggregate(Ty, Base, Members);
    } else if (getABIKind() == ARMABIInfo::AAPCS16_VFP) {
      // WatchOS does have homogeneous aggregates. Note that we intentionally use
      // this convention even for a variadic function: the backend will use GPRs
@@ -5954,9 +5972,15 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
    if (RetTy->isVoidType())
      return ABIArgInfo::getIgnore();
  
-  // Large vector types should be returned via memory.
-  if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128) {
-    return getNaturalAlignIndirect(RetTy);
+  if (const VectorType *VT = RetTy->getAs<VectorType>()) {
+    // Large vector types should be returned via memory.
+    if (getContext().getTypeSize(RetTy) > 128)
+      return getNaturalAlignIndirect(RetTy);
+    // FP16 vectors should be converted to integer vectors
+    if (!getTarget().hasLegalHalfType() &&
+        (VT->getElementType()->isFloat16Type() ||
+         VT->getElementType()->isHalfType()))
+      return coerceIllegalVector(RetTy);
    }
  
    // _Float16 and __fp16 get returned as if it were an int or float, but with
@@ -6016,11 +6040,8 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
    if (IsEffectivelyAAPCS_VFP) {
      const Type *Base = nullptr;
      uint64_t Members = 0;
-    if (isHomogeneousAggregate(RetTy, Base, Members)) {
-      assert(Base && "Base class should be set for homogeneous aggregate");
-      // Homogeneous Aggregates are returned directly.
-      return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
-    }
+    if (isHomogeneousAggregate(RetTy, Base, Members))
+      return classifyHomogeneousAggregate(RetTy, Base, Members);
    }
  
    // Aggregates <= 4 bytes are returned in r0; other aggregates
@@ -6055,6 +6076,13 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy,
  /// isIllegalVector - check whether Ty is an illegal vector type.
  bool ARMABIInfo::isIllegalVectorType(QualType Ty) const {
    if (const VectorType *VT = Ty->getAs<VectorType> ()) {
+    // On targets that don't support FP16, FP16 is expanded into float, and we
+    // don't want the ABI to depend on whether or not FP16 is supported in
+    // hardware. Thus return false to coerce FP16 vectors into integer vectors.
+    if (!getTarget().hasLegalHalfType() &&
+        (VT->getElementType()->isFloat16Type() ||
+         VT->getElementType()->isHalfType()))
+      return true;
      if (isAndroid()) {
        // Android shipped using Clang 3.1, which supported a slightly different
        // vector ABI. The primary differences were that 3-element vector types
diff --git a/test/CodeGen/arm-vfp16-arguments.c b/test/CodeGen/arm-vfp16-arguments.c

new file mode 100644 (file)

index 0000000..32f1bb7
--- /dev/null
+++ b/test/CodeGen/arm-vfp16-arguments.c
@@ -0,0 +1,76 @@
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
+// RUN:   -mfloat-abi soft -target-feature +neon -emit-llvm -o - -O1 %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK-SOFT
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
+// RUN:   -mfloat-abi hard -target-feature +neon -emit-llvm -o - -O1 %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK-HARD
+// RUN: %clang_cc1 -triple armv7a--none-eabi -target-abi aapcs \
+// RUN:   -mfloat-abi hard -target-feature +neon -target-feature +fullfp16 \
+// RUN:   -emit-llvm -o - -O1 %s \
+// RUN:   | FileCheck %s --check-prefix=CHECK-FULL
+
+typedef __attribute__((neon_vector_type(4))) __fp16 float16x4_t;
+typedef __attribute__((neon_vector_type(8))) __fp16 float16x8_t;
+
+typedef struct { float16x4_t x[2]; } hfa_t;
+// CHECK-FULL: %struct.hfa_t = type { [2 x <4 x half>] }
+
+float16x4_t g4;
+float16x8_t g8;
+
+void st4(float16x4_t a) { g4 = a; }
+// CHECK-SOFT: define void @st4(<2 x i32> %a.coerce)
+// CHECK-SOFT: store <2 x i32> %a.coerce, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
+//
+// CHECK-HARD: define arm_aapcs_vfpcc void @st4(<2 x i32> %a.coerce)
+// CHECK-HARD: store <2 x i32> %a.coerce, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
+//
+// CHECK-FULL: define arm_aapcs_vfpcc void @st4(<4 x half> %a)
+// CHECK-FULL: store <4 x half> %a, <4 x half>* @g4
+
+float16x4_t ld4(void) { return g4; }
+// CHECK-SOFT: define <2 x i32> @ld4()
+// CHECK-SOFT: %0 = load <2 x i32>, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
+// CHECK-SOFT: ret <2 x i32> %0
+//
+// CHECK-HARD: define arm_aapcs_vfpcc <2 x i32> @ld4()
+// CHECK-HARD: %0 = load <2 x i32>, <2 x i32>* bitcast (<4 x half>* @g4 to <2 x i32>*)
+// CHECK-HARD: ret <2 x i32> %0
+//
+// CHECK-FULL: define arm_aapcs_vfpcc <4 x half> @ld4()
+// CHECK-FULL: %0 = load <4 x half>, <4 x half>* @g4
+// CHECK-FULL: ret <4 x half> %0
+
+void st8(float16x8_t a) { g8 = a; }
+// CHECK-SOFT: define void @st8(<4 x i32> %a.coerce)
+// CHECK-SOFT: store <4 x i32> %a.coerce, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
+//
+// CHECK-HARD: define arm_aapcs_vfpcc void @st8(<4 x i32> %a.coerce)
+// CHECK-HARD: store <4 x i32> %a.coerce, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
+//
+// CHECK-FULL: define arm_aapcs_vfpcc void @st8(<8 x half> %a)
+// CHECK-FULL: store <8 x half> %a, <8 x half>* @g8
+
+float16x8_t ld8(void) { return g8; }
+// CHECK-SOFT: define <4 x i32> @ld8()
+// CHECK-SOFT: %0 = load <4 x i32>, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
+// CHECK-SOFT: ret <4 x i32> %0
+//
+// CHECK-HARD: define arm_aapcs_vfpcc <4 x i32> @ld8()
+// CHECK-HARD: %0 = load <4 x i32>, <4 x i32>* bitcast (<8 x half>* @g8 to <4 x i32>*)
+// CHECK-HARD: ret <4 x i32> %0
+//
+// CHECK-FULL: define arm_aapcs_vfpcc <8 x half> @ld8()
+// CHECK-FULL: %0 = load <8 x half>, <8 x half>* @g8
+// CHECK-FULL: ret <8 x half> %0
+
+void test_hfa(hfa_t a) {}
+// CHECK-SOFT: define void @test_hfa([2 x i64] %a.coerce)
+// CHECK-HARD: define arm_aapcs_vfpcc void @test_hfa([2 x <2 x i32>] %a.coerce)
+// CHECK-FULL: define arm_aapcs_vfpcc void @test_hfa(%struct.hfa_t %a.coerce)
+
+hfa_t ghfa;
+hfa_t test_ret_hfa(void) { return ghfa; }
+// CHECK-SOFT: define void @test_ret_hfa(%struct.hfa_t* noalias nocapture sret %agg.result)
+// CHECK-HARD: define arm_aapcs_vfpcc [2 x <2 x i32>] @test_ret_hfa()
+// CHECK-FULL: define arm_aapcs_vfpcc %struct.hfa_t @test_ret_hfa()
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c

index f6305062e8fdcff83ec461f564d2c42669e1f18d..c0e0dee0d32e55e4c1f03d40f48cb8a4ec02e0c8 100644 (file)
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -1,5 +1,6 @@
  // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
-// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding \
+// RUN:  -target-cpu swift -fallow-half-arguments-and-returns \
+// RUN:  -target-feature +fullfp16 -ffreestanding \
  // RUN:  -disable-O0-optnone -emit-llvm -o - %s \
  // RUN:  | opt -S -mem2reg | FileCheck %s
  
@@ -3896,9 +3897,8 @@ int64x2_t test_vld1q_s64(int64_t const * a) {
  
  // CHECK-LABEL: @test_vld1q_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD1:%.*]] = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* [[TMP0]], i32 2)
-// CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> [[VLD1]] to <8 x half>
-// CHECK:   ret <8 x half> [[TMP1]]
+// CHECK:   [[VLD1:%.*]] = call <8 x half> @llvm.arm.neon.vld1.v8f16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <8 x half> [[VLD1]]
  float16x8_t test_vld1q_f16(float16_t const * a) {
    return vld1q_f16(a);
  }
@@ -3990,9 +3990,8 @@ int64x1_t test_vld1_s64(int64_t const * a) {
  
  // CHECK-LABEL: @test_vld1_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD1:%.*]] = call <4 x i16> @llvm.arm.neon.vld1.v4i16.p0i8(i8* [[TMP0]], i32 2)
-// CHECK:   [[TMP1:%.*]] = bitcast <4 x i16> [[VLD1]] to <4 x half>
-// CHECK:   ret <4 x half> [[TMP1]]
+// CHECK:   [[VLD1:%.*]] = call <4 x half> @llvm.arm.neon.vld1.v4f16.p0i8(i8* [[TMP0]], i32 2)
+// CHECK:   ret <4 x half> [[VLD1]]
  float16x4_t test_vld1_f16(float16_t const * a) {
    return vld1_f16(a);
  }
@@ -4106,12 +4105,11 @@ int64x2_t test_vld1q_dup_s64(int64_t const * a) {
  
  // CHECK-LABEL: @test_vld1q_dup_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK:   [[TMP3:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> zeroinitializer
-// CHECK:   [[TMP4:%.*]] = bitcast <8 x i16> [[LANE]] to <8 x half>
-// CHECK:   ret <8 x half> [[TMP4]]
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <8 x half> undef, half [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <8 x half> [[TMP3]], <8 x half> [[TMP3]], <8 x i32> zeroinitializer
+// CHECK:   ret <8 x half> [[LANE]]
  float16x8_t test_vld1q_dup_f16(float16_t const * a) {
    return vld1q_dup_f16(a);
  }
@@ -4233,12 +4231,11 @@ int64x1_t test_vld1_dup_s64(int64_t const * a) {
  
  // CHECK-LABEL: @test_vld1_dup_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
-// CHECK:   [[TMP3:%.*]] = insertelement <4 x i16> undef, i16 [[TMP2]], i32 0
-// CHECK:   [[LANE:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP3]], <4 x i32> zeroinitializer
-// CHECK:   [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <4 x half>
-// CHECK:   ret <4 x half> [[TMP4]]
+// CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   [[TMP2:%.*]] = load half, half* [[TMP1]], align 2
+// CHECK:   [[TMP3:%.*]] = insertelement <4 x half> undef, half [[TMP2]], i32 0
+// CHECK:   [[LANE:%.*]] = shufflevector <4 x half> [[TMP3]], <4 x half> [[TMP3]], <4 x i32> zeroinitializer
+// CHECK:   ret <4 x half> [[LANE]]
  float16x4_t test_vld1_dup_f16(float16_t const * a) {
    return vld1_dup_f16(a);
  }
@@ -4365,12 +4362,11 @@ int64x2_t test_vld1q_lane_s64(int64_t const * a, int64x2_t b) {
  // CHECK-LABEL: @test_vld1q_lane_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP4]], i32 7
-// CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[VLD1_LANE]] to <8 x half>
-// CHECK:   ret <8 x half> [[TMP5]]
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <8 x half> [[TMP2]], half [[TMP4]], i32 7
+// CHECK:   ret <8 x half> [[VLD1_LANE]]
  float16x8_t test_vld1q_lane_f16(float16_t const * a, float16x8_t b) {
    return vld1q_lane_f16(a, b, 7);
  }
@@ -4498,12 +4494,11 @@ int64x1_t test_vld1_lane_s64(int64_t const * a, int64x1_t b) {
  // CHECK-LABEL: @test_vld1_lane_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   [[TMP4:%.*]] = load i16, i16* [[TMP3]], align 2
-// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[TMP4]], i32 3
-// CHECK:   [[TMP5:%.*]] = bitcast <4 x i16> [[VLD1_LANE]] to <4 x half>
-// CHECK:   ret <4 x half> [[TMP5]]
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK:   [[TMP3:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   [[TMP4:%.*]] = load half, half* [[TMP3]], align 2
+// CHECK:   [[VLD1_LANE:%.*]] = insertelement <4 x half> [[TMP2]], half [[TMP4]], i32 3
+// CHECK:   ret <4 x half> [[VLD1_LANE]]
  float16x4_t test_vld1_lane_f16(float16_t const * a, float16x4_t b) {
    return vld1_lane_f16(a, b, 3);
  }
@@ -4596,7 +4591,7 @@ int32x4x2_t test_vld2q_s32(int32_t const * a) {
  // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
  // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK:   [[VLD2Q_V:%.*]] = call { <8 x half>, <8 x half>
  float16x8x2_t test_vld2q_f16(float16_t const * a) {
    return vld2q_f16(a);
  }
@@ -4701,7 +4696,7 @@ int64x1x2_t test_vld2_s64(int64_t const * a) {
  // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8
  // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD2_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK:   [[VLD2_V:%.*]] = call { <4 x half>, <4 x half>
  float16x4x2_t test_vld2_f16(float16_t const * a) {
    return vld2_f16(a);
  }
@@ -4860,9 +4855,9 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
  // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
  // CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
  // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
+// CHECK:   [[VLD2Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>
  float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
    return vld2q_lane_f16(a, b, 7);
  }
@@ -5093,9 +5088,9 @@ int32x2x2_t test_vld2_lane_s32(int32_t const * a, int32x2x2_t b) {
  // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
  // CHECK:   [[TMP7:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
  // CHECK:   [[TMP8:%.*]] = bitcast <4 x half> [[TMP7]] to <8 x i8>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
+// CHECK:   [[VLD2_LANE_V:%.*]] = call { <4 x half>, <4 x half>
  float16x4x2_t test_vld2_lane_f16(float16_t const * a, float16x4x2_t b) {
    return vld2_lane_f16(a, b, 3);
  }
@@ -5232,7 +5227,7 @@ int32x4x3_t test_vld3q_s32(int32_t const * a) {
  // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
  // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK:   [[VLD3Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
  float16x8x3_t test_vld3q_f16(float16_t const * a) {
    return vld3q_f16(a);
  }
@@ -5337,7 +5332,7 @@ int64x1x3_t test_vld3_s64(int64_t const * a) {
  // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8
  // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD3_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK:   [[VLD3_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
  float16x4x3_t test_vld3_f16(float16_t const * a) {
    return vld3_f16(a);
  }
@@ -5520,10 +5515,10 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
  // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
  // CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
  // CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
+// CHECK:   [[VLD3Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>
  float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
    return vld3q_lane_f16(a, b, 7);
  }
@@ -5794,10 +5789,10 @@ int32x2x3_t test_vld3_lane_s32(int32_t const * a, int32x2x3_t b) {
  // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
  // CHECK:   [[TMP9:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
  // CHECK:   [[TMP10:%.*]] = bitcast <4 x half> [[TMP9]] to <8 x i8>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
+// CHECK:   [[VLD3_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>
  float16x4x3_t test_vld3_lane_f16(float16_t const * a, float16x4x3_t b) {
    return vld3_lane_f16(a, b, 3);
  }
@@ -5947,7 +5942,7 @@ int32x4x4_t test_vld4q_s32(int32_t const * a) {
  // CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
  // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK:   [[VLD4Q_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
  float16x8x4_t test_vld4q_f16(float16_t const * a) {
    return vld4q_f16(a);
  }
@@ -6052,7 +6047,7 @@ int64x1x4_t test_vld4_s64(int64_t const * a) {
  // CHECK:   [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8
  // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
-// CHECK:   [[VLD4_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK:   [[VLD4_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
  float16x4x4_t test_vld4_f16(float16_t const * a) {
    return vld4_f16(a);
  }
@@ -6259,11 +6254,11 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
  // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
  // CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
  // CHECK:   [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x i16>
-// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x i16>
-// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x half>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x half>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP10]] to <8 x half>
+// CHECK:   [[TMP16:%.*]] = bitcast <16 x i8> [[TMP12]] to <8 x half>
+// CHECK:   [[VLD4Q_LANE_V:%.*]] = call { <8 x half>, <8 x half>, <8 x half>, <8 x half>
  float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
    return vld4q_lane_f16(a, b, 7);
  }
@@ -6574,11 +6569,11 @@ int32x2x4_t test_vld4_lane_s32(int32_t const * a, int32x2x4_t b) {
  // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
  // CHECK:   [[TMP11:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
  // CHECK:   [[TMP12:%.*]] = bitcast <4 x half> [[TMP11]] to <8 x i8>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x i16>
-// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x i16>
-// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP6]] to <4 x half>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP8]] to <4 x half>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP10]] to <4 x half>
+// CHECK:   [[TMP16:%.*]] = bitcast <8 x i8> [[TMP12]] to <4 x half>
+// CHECK:   [[VLD4_LANE_V:%.*]] = call { <4 x half>, <4 x half>, <4 x half>, <4 x half>
  float16x4x4_t test_vld4_lane_f16(float16_t const * a, float16x4x4_t b) {
    return vld4_lane_f16(a, b, 3);
  }
@@ -15469,8 +15464,8 @@ void test_vst1q_s64(int64_t * a, int64x2_t b) {
  // CHECK-LABEL: @test_vst1q_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* [[TMP0]], <8 x i16> [[TMP2]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v8f16(i8* [[TMP0]], <8 x half> [[TMP2]], i32 2)
  // CHECK:   ret void
  void test_vst1q_f16(float16_t * a, float16x8_t b) {
    vst1q_f16(a, b);
@@ -15580,8 +15575,8 @@ void test_vst1_s64(int64_t * a, int64x1_t b) {
  // CHECK-LABEL: @test_vst1_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4i16(i8* [[TMP0]], <4 x i16> [[TMP2]], i32 2)
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK:   call void @llvm.arm.neon.vst1.p0i8.v4f16(i8* [[TMP0]], <4 x half> [[TMP2]], i32 2)
  // CHECK:   ret void
  void test_vst1_f16(float16_t * a, float16x4_t b) {
    vst1_f16(a, b);
@@ -15703,10 +15698,10 @@ void test_vst1q_lane_s64(int64_t * a, int64x2_t b) {
  // CHECK-LABEL: @test_vst1q_lane_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast <8 x half> %b to <16 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x i16>
-// CHECK:   [[TMP3:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK:   [[TMP3:%.*]] = extractelement <8 x half> [[TMP2]], i32 7
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   store half [[TMP3]], half* [[TMP4]], align 2
  // CHECK:   ret void
  void test_vst1q_lane_f16(float16_t * a, float16x8_t b) {
    vst1q_lane_f16(a, b, 7);
@@ -15835,10 +15830,10 @@ void test_vst1_lane_s64(int64_t * a, int64x1_t b) {
  // CHECK-LABEL: @test_vst1_lane_f16(
  // CHECK:   [[TMP0:%.*]] = bitcast half* %a to i8*
  // CHECK:   [[TMP1:%.*]] = bitcast <4 x half> %b to <8 x i8>
-// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x i16>
-// CHECK:   [[TMP3:%.*]] = extractelement <4 x i16> [[TMP2]], i32 3
-// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to i16*
-// CHECK:   store i16 [[TMP3]], i16* [[TMP4]], align 2
+// CHECK:   [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <4 x half>
+// CHECK:   [[TMP3:%.*]] = extractelement <4 x half> [[TMP2]], i32 3
+// CHECK:   [[TMP4:%.*]] = bitcast i8* [[TMP0]] to half*
+// CHECK:   store half [[TMP3]], half* [[TMP4]], align 2
  // CHECK:   ret void
  void test_vst1_lane_f16(float16_t * a, float16x4_t b) {
    vst1_lane_f16(a, b, 3);
@@ -16040,9 +16035,9 @@ void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
  // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
  // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
  // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 2)
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 2)
  // CHECK:   ret void
  void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
    vst2q_f16(a, b);
@@ -16337,9 +16332,9 @@ void test_vst2_s64(int64_t * a, int64x1x2_t b) {
  // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
  // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
  // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 2)
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK:   call void @llvm.arm.neon.vst2.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 2)
  // CHECK:   ret void
  void test_vst2_f16(float16_t * a, float16x4x2_t b) {
    vst2_f16(a, b);
@@ -16540,9 +16535,9 @@ void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
  // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
  // CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
  // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP8]], <8 x i16> [[TMP9]], i32 7, i32 2)
+// CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP8]], <8 x half> [[TMP9]], i32 7, i32 2)
  // CHECK:   ret void
  void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
    vst2q_lane_f16(a, b, 7);
@@ -16764,9 +16759,9 @@ void test_vst2_lane_s32(int32_t * a, int32x2x2_t b) {
  // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x half>], [2 x <4 x half>]* [[VAL1]], i32 0, i32 1
  // CHECK:   [[TMP6:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX2]], align 8
  // CHECK:   [[TMP7:%.*]] = bitcast <4 x half> [[TMP6]] to <8 x i8>
-// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP8]], <4 x i16> [[TMP9]], i32 3, i32 2)
+// CHECK:   [[TMP8:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK:   [[TMP9:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK:   call void @llvm.arm.neon.vst2lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP8]], <4 x half> [[TMP9]], i32 3, i32 2)
  // CHECK:   ret void
  void test_vst2_lane_f16(float16_t * a, float16x4x2_t b) {
    vst2_lane_f16(a, b, 3);
@@ -17039,10 +17034,10 @@ void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
  // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
  // CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
  // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 2)
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 2)
  // CHECK:   ret void
  void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
    vst3q_f16(a, b);
@@ -17390,10 +17385,10 @@ void test_vst3_s64(int64_t * a, int64x1x3_t b) {
  // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
  // CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
  // CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 2)
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
+// CHECK:   call void @llvm.arm.neon.vst3.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 2)
  // CHECK:   ret void
  void test_vst3_f16(float16_t * a, float16x4x3_t b) {
    vst3_f16(a, b);
@@ -17631,10 +17626,10 @@ void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
  // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
  // CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
  // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP10]], <8 x i16> [[TMP11]], <8 x i16> [[TMP12]], i32 7, i32 2)
+// CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP10]], <8 x half> [[TMP11]], <8 x half> [[TMP12]], i32 7, i32 2)
  // CHECK:   ret void
  void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
    vst3q_lane_f16(a, b, 7);
@@ -17896,10 +17891,10 @@ void test_vst3_lane_s32(int32_t * a, int32x2x3_t b) {
  // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x half>], [3 x <4 x half>]* [[VAL3]], i32 0, i32 2
  // CHECK:   [[TMP8:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX4]], align 8
  // CHECK:   [[TMP9:%.*]] = bitcast <4 x half> [[TMP8]] to <8 x i8>
-// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP10]], <4 x i16> [[TMP11]], <4 x i16> [[TMP12]], i32 3, i32 2)
+// CHECK:   [[TMP10:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK:   [[TMP11:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
+// CHECK:   call void @llvm.arm.neon.vst3lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP10]], <4 x half> [[TMP11]], <4 x half> [[TMP12]], i32 3, i32 2)
  // CHECK:   ret void
  void test_vst3_lane_f16(float16_t * a, float16x4x3_t b) {
    vst3_lane_f16(a, b, 3);
@@ -18215,11 +18210,11 @@ void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
  // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
  // CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
  // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 2)
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 2)
  // CHECK:   ret void
  void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
    vst4q_f16(a, b);
@@ -18620,11 +18615,11 @@ void test_vst4_s64(int64_t * a, int64x1x4_t b) {
  // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
  // CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
  // CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 2)
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
+// CHECK:   call void @llvm.arm.neon.vst4.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 2)
  // CHECK:   ret void
  void test_vst4_f16(float16_t * a, float16x4x4_t b) {
    vst4_f16(a, b);
@@ -18899,11 +18894,11 @@ void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
  // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
  // CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
  // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x i16>
-// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8i16(i8* [[TMP3]], <8 x i16> [[TMP12]], <8 x i16> [[TMP13]], <8 x i16> [[TMP14]], <8 x i16> [[TMP15]], i32 7, i32 2)
+// CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x half>
+// CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x half>
+// CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP9]] to <8 x half>
+// CHECK:   [[TMP15:%.*]] = bitcast <16 x i8> [[TMP11]] to <8 x half>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v8f16(i8* [[TMP3]], <8 x half> [[TMP12]], <8 x half> [[TMP13]], <8 x half> [[TMP14]], <8 x half> [[TMP15]], i32 7, i32 2)
  // CHECK:   ret void
  void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
    vst4q_lane_f16(a, b, 7);
@@ -19205,11 +19200,11 @@ void test_vst4_lane_s32(int32_t * a, int32x2x4_t b) {
  // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x half>], [4 x <4 x half>]* [[VAL5]], i32 0, i32 3
  // CHECK:   [[TMP10:%.*]] = load <4 x half>, <4 x half>* [[ARRAYIDX6]], align 8
  // CHECK:   [[TMP11:%.*]] = bitcast <4 x half> [[TMP10]] to <8 x i8>
-// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x i16>
-// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16>
-// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x i16>
-// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x i16>
-// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4i16(i8* [[TMP3]], <4 x i16> [[TMP12]], <4 x i16> [[TMP13]], <4 x i16> [[TMP14]], <4 x i16> [[TMP15]], i32 3, i32 2)
+// CHECK:   [[TMP12:%.*]] = bitcast <8 x i8> [[TMP5]] to <4 x half>
+// CHECK:   [[TMP13:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x half>
+// CHECK:   [[TMP14:%.*]] = bitcast <8 x i8> [[TMP9]] to <4 x half>
+// CHECK:   [[TMP15:%.*]] = bitcast <8 x i8> [[TMP11]] to <4 x half>
+// CHECK:   call void @llvm.arm.neon.vst4lane.p0i8.v4f16(i8* [[TMP3]], <4 x half> [[TMP12]], <4 x half> [[TMP13]], <4 x half> [[TMP14]], <4 x half> [[TMP15]], i32 3, i32 2)
  // CHECK:   ret void
  void test_vst4_lane_f16(float16_t * a, float16x4x4_t b) {
    vst4_lane_f16(a, b, 3);
author	Mikhail Maltsev <mikhail.maltsev@arm.com>
	Wed, 12 Sep 2018 09:19:19 +0000 (09:19 +0000)
committer	Mikhail Maltsev <mikhail.maltsev@arm.com>
	Wed, 12 Sep 2018 09:19:19 +0000 (09:19 +0000)
lib/CodeGen/TargetInfo.cpp		patch \| blob \| history
test/CodeGen/arm-vfp16-arguments.c	[new file with mode: 0644]	patch \| blob
test/CodeGen/arm_neon_intrinsics.c		patch \| blob \| history